import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, KFold
import seaborn as sns
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn import base
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,roc_auc_score,classification_report,confusion_matrix
from IPython.display import Image
from category_encoders import TargetEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_selection import mutual_info_regression, f_classif, SelectKBest
import re
from scipy import stats
from sklearn.impute import KNNImputer, SimpleImputer
sns.set_style('whitegrid')
pd.options.mode.chained_assignment = None
from numpy import percentile
%matplotlib inline
from statsmodels.formula.api import ols
import statsmodels.api as sm
# To import dataset
Weather = pd.read_csv('weatherAUS.csv')
#per data description we are to drop RISK_MM to prevent data leakage
Weather.drop(['RISK_MM'], axis=1, inplace=True)
# To show number of rows and columns in the dataset
Weather.shape
(142193, 23)
# To reduce the dataset and by 80% due to its large size and make computationally quicker to process
data_subset, _ = train_test_split(Weather,test_size = 0.80,random_state=20201202)
To create a copy of this subset to avoid errors
Weather_subset = data_subset.copy(deep = True)
# To show number of rows and columns in the new/subset dataset
Weather_subset.shape
(28438, 23)
# For dataset
Weather_subset.head(5).T
| 33491 | 16277 | 57069 | 96349 | 105217 | |
|---|---|---|---|---|---|
| Date | 2010-10-20 | 2013-07-05 | 2009-12-03 | 2015-11-02 | 2015-05-27 |
| Location | SydneyAirport | Newcastle | Bendigo | Adelaide | Woomera |
| MinTemp | 12.8 | 9 | 10.3 | 15.5 | 11.7 |
| MaxTemp | 21.6 | 21.7 | 32.9 | 23.6 | 23.5 |
| Rainfall | 0 | 0.2 | 0 | 0 | 0 |
| Evaporation | 3 | NaN | 5.6 | NaN | 3.6 |
| Sunshine | 10.9 | NaN | NaN | NaN | 3.1 |
| WindGustDir | NE | NaN | NW | WSW | NW |
| WindGustSpeed | 43 | NaN | 63 | 30 | 31 |
| WindDir9am | WNW | NW | NNE | NNW | NaN |
| WindDir3pm | ENE | SW | NW | WSW | WNW |
| WindSpeed9am | 9 | 9 | 13 | 9 | 0 |
| WindSpeed3pm | 30 | 4 | 28 | 13 | 20 |
| Humidity9am | 62 | 70 | 33 | 74 | 52 |
| Humidity3pm | 56 | 40 | 12 | 47 | 33 |
| Pressure9am | 1029 | NaN | 1016.1 | 1014 | 1022.1 |
| Pressure3pm | 1025.6 | NaN | 1010.8 | 1014.2 | 1019.2 |
| Cloud9am | 3 | 0 | 4 | NaN | 5 |
| Cloud3pm | 1 | 5 | 7 | NaN | 6 |
| Temp9am | 18.2 | 15.5 | 22.9 | 17.8 | 17 |
| Temp3pm | 19.8 | 20.8 | 32.5 | 22 | 22.6 |
| RainToday | No | No | No | No | No |
| RainTomorrow | No | No | No | No | No |
# For details on each column
Weather_subset.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| MinTemp | 28296.0 | 12.146409 | 6.433939 | -8.2 | 7.5 | 11.9 | 16.8 | 30.3 |
| MaxTemp | 28361.0 | 23.195042 | 7.090857 | -3.1 | 17.9 | 22.6 | 28.2 | 48.1 |
| Rainfall | 28147.0 | 2.384940 | 8.907185 | 0.0 | 0.0 | 0.0 | 0.7 | 371.0 |
| Evaporation | 16223.0 | 5.447920 | 4.183820 | 0.0 | 2.6 | 4.8 | 7.4 | 86.2 |
| Sunshine | 14832.0 | 7.647101 | 3.770144 | 0.0 | 4.9 | 8.5 | 10.6 | 14.5 |
| WindGustSpeed | 26571.0 | 39.962440 | 13.552411 | 6.0 | 31.0 | 39.0 | 48.0 | 126.0 |
| WindSpeed9am | 28161.0 | 13.972586 | 8.868169 | 0.0 | 7.0 | 13.0 | 19.0 | 130.0 |
| WindSpeed3pm | 27888.0 | 18.637658 | 8.822201 | 0.0 | 13.0 | 19.0 | 24.0 | 78.0 |
| Humidity9am | 28082.0 | 68.964390 | 18.968322 | 3.0 | 57.0 | 70.0 | 83.0 | 100.0 |
| Humidity3pm | 27724.0 | 51.449394 | 20.783523 | 0.0 | 36.0 | 52.0 | 65.0 | 100.0 |
| Pressure9am | 25560.0 | 1017.642641 | 7.085379 | 982.0 | 1012.9 | 1017.6 | 1022.4 | 1040.4 |
| Pressure3pm | 25569.0 | 1015.238123 | 7.013010 | 977.1 | 1010.4 | 1015.2 | 1020.0 | 1038.9 |
| Cloud9am | 17666.0 | 4.433658 | 2.887697 | 0.0 | 1.0 | 5.0 | 7.0 | 8.0 |
| Cloud3pm | 16988.0 | 4.494172 | 2.718872 | 0.0 | 2.0 | 5.0 | 7.0 | 8.0 |
| Temp9am | 28248.0 | 16.935907 | 6.486651 | -7.0 | 12.2 | 16.7 | 21.5 | 39.0 |
| Temp3pm | 27887.0 | 21.666726 | 6.921655 | -4.2 | 16.6 | 21.1 | 26.4 | 46.2 |
# To provide extra info on the data
Weather_subset.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 28438 entries, 33491 to 17089 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 28438 non-null object 1 Location 28438 non-null object 2 MinTemp 28296 non-null float64 3 MaxTemp 28361 non-null float64 4 Rainfall 28147 non-null float64 5 Evaporation 16223 non-null float64 6 Sunshine 14832 non-null float64 7 WindGustDir 26564 non-null object 8 WindGustSpeed 26571 non-null float64 9 WindDir9am 26468 non-null object 10 WindDir3pm 27672 non-null object 11 WindSpeed9am 28161 non-null float64 12 WindSpeed3pm 27888 non-null float64 13 Humidity9am 28082 non-null float64 14 Humidity3pm 27724 non-null float64 15 Pressure9am 25560 non-null float64 16 Pressure3pm 25569 non-null float64 17 Cloud9am 17666 non-null float64 18 Cloud3pm 16988 non-null float64 19 Temp9am 28248 non-null float64 20 Temp3pm 27887 non-null float64 21 RainToday 28147 non-null object 22 RainTomorrow 28438 non-null object dtypes: float64(16), object(7) memory usage: 5.2+ MB
# To define and show column names
col_names = Weather.columns
col_names
Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
'Temp3pm', 'RainToday', 'RainTomorrow'],
dtype='object')
# To show columns as dataset 'object'
Weather_subset.describe(include=['object'])
| Date | Location | WindGustDir | WindDir9am | WindDir3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|
| count | 28438 | 28438 | 26564 | 26468 | 27672 | 28147 | 28438 |
| unique | 3241 | 49 | 16 | 16 | 16 | 2 | 2 |
| top | 2016-02-15 | Canberra | W | N | SE | No | No |
| freq | 20 | 703 | 1957 | 2321 | 2122 | 21894 | 22041 |
#To remove date as dtype object and replace as integer in the form of day, month and year for ease of reference using 'datetime'
Weather_subset['Date'] = pd.to_datetime(Weather_subset['Date'])
# extract year from date
Weather_subset['Year'] = Weather_subset['Date'].dt.year
# extract month from date
Weather_subset['Month'] = Weather_subset['Date'].dt.month
# extract day from date
Weather_subset['Day'] = Weather_subset['Date'].dt.day
# Remove 'Date' column as it corresponds to the three newly created columns above
Weather_subset.drop(['Date'], axis=1, inplace=True)
# To assign 1 = Rain tomorrow and 0 = No Rain Tomorrow. This is our dependent (y) variable
Weather_subset["RainTomorrow"] = [1 if each == "Yes" else 0 for each in Weather_subset["RainTomorrow"]]
# To show updated data type for each column
Weather_subset.dtypes
Location object MinTemp float64 MaxTemp float64 Rainfall float64 Evaporation float64 Sunshine float64 WindGustDir object WindGustSpeed float64 WindDir9am object WindDir3pm object WindSpeed9am float64 WindSpeed3pm float64 Humidity9am float64 Humidity3pm float64 Pressure9am float64 Pressure3pm float64 Cloud9am float64 Cloud3pm float64 Temp9am float64 Temp3pm float64 RainToday object RainTomorrow int64 Year int64 Month int64 Day int64 dtype: object
# To divide Categorical, Numerical and Other columns into separate groups for review and action
def basic_info(data):
print("Dataset shape is: ", data.shape)
print("Dataset size is: ", data.size)
print("Dataset columns are: ", data.columns)
print("Dataset info is: ", data.info())
categorical = []
numerical = []
others = []
for i in data.columns:
if data[i].dtype == object:
categorical.append(i)
elif data[i].dtype == float:
numerical.append(i)
else:
others.append(i)
print("Categorical variables are:\n ", categorical)
print("Numerical variables are:\n ", numerical)
print("Other variables are:\n ", others)
return categorical, numerical, others
categorical, numerical, others = basic_info(Weather_subset)
Dataset shape is: (28438, 25)
Dataset size is: 710950
Dataset columns are: Index(['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
'Temp3pm', 'RainToday', 'RainTomorrow', 'Year', 'Month', 'Day'],
dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 28438 entries, 33491 to 17089
Data columns (total 25 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Location 28438 non-null object
1 MinTemp 28296 non-null float64
2 MaxTemp 28361 non-null float64
3 Rainfall 28147 non-null float64
4 Evaporation 16223 non-null float64
5 Sunshine 14832 non-null float64
6 WindGustDir 26564 non-null object
7 WindGustSpeed 26571 non-null float64
8 WindDir9am 26468 non-null object
9 WindDir3pm 27672 non-null object
10 WindSpeed9am 28161 non-null float64
11 WindSpeed3pm 27888 non-null float64
12 Humidity9am 28082 non-null float64
13 Humidity3pm 27724 non-null float64
14 Pressure9am 25560 non-null float64
15 Pressure3pm 25569 non-null float64
16 Cloud9am 17666 non-null float64
17 Cloud3pm 16988 non-null float64
18 Temp9am 28248 non-null float64
19 Temp3pm 27887 non-null float64
20 RainToday 28147 non-null object
21 RainTomorrow 28438 non-null int64
22 Year 28438 non-null int64
23 Month 28438 non-null int64
24 Day 28438 non-null int64
dtypes: float64(16), int64(4), object(5)
memory usage: 5.6+ MB
Dataset info is: None
Categorical variables are:
['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
Numerical variables are:
['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
Other variables are:
['RainTomorrow', 'Year', 'Month', 'Day']
#To show numerical columns to be investigated
print("Numerical variables are:\n ", numerical)
Numerical variables are: ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
#Calculate mean
average = Weather_subset["MinTemp"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["MinTemp"].median()
print("The middle point/median is", middle_point)
#define max and min
MinTemp_total = Weather_subset.MinTemp
MinTemp_total_min, MinTemp_total_max = MinTemp_total.min(), MinTemp_total.max()
print("The minimum value is", MinTemp_total_min)
print("The maximum value is", MinTemp_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["MinTemp"].std()
print("The standard deviation is", deviation)
range = MinTemp_total_max - MinTemp_total_min
print("The range is", range)
The mean is 12.146409386485724 The middle point/median is 11.9 The minimum value is -8.2 The maximum value is 30.3 The standard deviation is 6.433939033430801 The range is 38.5
#Calculate mean
average = Weather_subset["MaxTemp"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["MaxTemp"].median()
print("The middle point/median is", middle_point)
#define max and min
MaxTemp_total = Weather_subset.MaxTemp
MaxTemp_total_min, MaxTemp_total_max = MaxTemp_total.min(), MaxTemp_total.max()
print("The minimum value is", MaxTemp_total_min)
print("The maximum value is", MaxTemp_total_max)
#Calculate MaxTemp deviation and range
deviation = Weather_subset["MaxTemp"].std()
print("The standard deviation is", deviation)
range = MaxTemp_total_max - MaxTemp_total_min
print("The range is", range)
The mean is 23.195042487923555 The middle point/median is 22.6 The minimum value is -3.1 The maximum value is 48.1 The standard deviation is 7.0908566719270585 The range is 51.2
#Calculate mean
average = Weather_subset["Rainfall"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["Rainfall"].median()
print("The middle point/median is", middle_point)
#define max and min
Rainfall_total = Weather_subset.Rainfall
Rainfall_total_min, Rainfall_total_max = Rainfall_total.min(), Rainfall_total.max()
print("The minimum value is", Rainfall_total_min)
print("The maximum value is", Rainfall_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["Rainfall"].std()
print("The standard deviation is", deviation)
range = Rainfall_total_max - Rainfall_total_min
print("The range is", range)
The mean is 2.3849397804384123 The middle point/median is 0.0 The minimum value is 0.0 The maximum value is 371.0 The standard deviation is 8.907184833625912 The range is 371.0
#Calculate mean
average = Weather_subset["Evaporation"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["Evaporation"].median()
print("The middle point/median is", middle_point)
#define max and min
Evaporation_total = Weather_subset.Evaporation
Evaporation_total_min, Evaporation_total_max = Evaporation_total.min(), Evaporation_total.max()
print("The minimum value is", Evaporation_total_min)
print("The maximum value is", Evaporation_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["Evaporation"].std()
print("The standard deviation is", deviation)
range = Evaporation_total_max - Evaporation_total_min
print("The range is", range)
The mean is 5.447919620292177 The middle point/median is 4.8 The minimum value is 0.0 The maximum value is 86.2 The standard deviation is 4.183819594480705 The range is 86.2
#Calculate mean
average = Weather_subset["Sunshine"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["Sunshine"].median()
print("The middle point/median is", middle_point)
#define max and min
Sunshine_total = Weather_subset.Sunshine
Sunshine_total_min, Sunshine_total_max = Sunshine_total.min(), Sunshine_total.max()
print("The minimum value is", Sunshine_total_min)
print("The maximum value is", Sunshine_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["Sunshine"].std()
print("The standard deviation is", deviation)
range = Sunshine_total_max - Sunshine_total_min
print("The range is", range)
The mean is 7.647100862998921 The middle point/median is 8.5 The minimum value is 0.0 The maximum value is 14.5 The standard deviation is 3.770144288142796 The range is 14.5
#Calculate mean
average = Weather_subset["WindGustSpeed"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["WindGustSpeed"].median()
print("The middle point/median is", middle_point)
#define max and min
WindGustSpeed_total = Weather_subset.WindGustSpeed
WindGustSpeed_total_min, WindGustSpeed_total_max = WindGustSpeed_total.min(), WindGustSpeed_total.max()
print("The minimum value is", WindGustSpeed_total_min)
print("The maximum value is", WindGustSpeed_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["WindGustSpeed"].std()
print("The standard deviation is", deviation)
range = WindGustSpeed_total_max - WindGustSpeed_total_min
print("The range is", range)
The mean is 39.962440254412705 The middle point/median is 39.0 The minimum value is 6.0 The maximum value is 126.0 The standard deviation is 13.55241086476566 The range is 120.0
#Calculate mean
average = Weather_subset["WindSpeed9am"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["WindSpeed9am"].median()
print("The middle point/median is", middle_point)
#define max and min
WindSpeed9am_total = Weather_subset.WindSpeed9am
WindSpeed9am_total_min, WindSpeed9am_total_max = WindSpeed9am_total.min(), WindSpeed9am_total.max()
print("The minimum value is", WindSpeed9am_total_min)
print("The maximum value is", WindSpeed9am_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["WindSpeed9am"].std()
print("The standard deviation is", deviation)
range = WindSpeed9am_total_max - WindSpeed9am_total_min
print("The range is", range)
The mean is 13.97258620077412 The middle point/median is 13.0 The minimum value is 0.0 The maximum value is 130.0 The standard deviation is 8.86816885898493 The range is 130.0
#Calculate mean
average = Weather_subset["WindSpeed3pm"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["WindSpeed3pm"].median()
print("The middle point/median is", middle_point)
#define max and min
WindSpeed3pm_total = Weather_subset.WindSpeed3pm
WindSpeed3pm_total_min, WindSpeed3pm_total_max = WindSpeed3pm_total.min(), WindSpeed3pm_total.max()
print("The minimum value is", WindSpeed3pm_total_min)
print("The maximum value is", WindSpeed3pm_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["WindSpeed3pm"].std()
print("The standard deviation is", deviation)
range = WindSpeed3pm_total_max - WindSpeed3pm_total_min
print("The range is", range)
The mean is 18.637657773952956 The middle point/median is 19.0 The minimum value is 0.0 The maximum value is 78.0 The standard deviation is 8.822201014810105 The range is 78.0
#Calculate mean
average = Weather_subset["Humidity9am"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["Humidity9am"].median()
print("The middle point/median is", middle_point)
#define max and min
Humidity9am_total = Weather_subset.Humidity9am
Humidity9am_total_min, Humidity9am_total_max = Humidity9am_total.min(), Humidity9am_total.max()
print("The minimum value is", Humidity9am_total_min)
print("The maximum value is", Humidity9am_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["Humidity9am"].std()
print("The standard deviation is", deviation)
range = Humidity9am_total_max - Humidity9am_total_min
print("The range is", range)
The mean is 68.9643900007122 The middle point/median is 70.0 The minimum value is 3.0 The maximum value is 100.0 The standard deviation is 18.968322101119423 The range is 97.0
#Calculate mean
average = Weather_subset["Humidity3pm"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["Humidity3pm"].median()
print("The middle point/median is", middle_point)
#define max and min
Humidity3pm_total = Weather_subset.Humidity3pm
Humidity3pm_total_min, Humidity3pm_total_max = Humidity3pm_total.min(), Humidity3pm_total.max()
print("The minimum value is", Humidity3pm_total_min)
print("The maximum value is", Humidity3pm_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["Humidity3pm"].std()
print("The standard deviation is", deviation)
range = Humidity3pm_total_max - Humidity3pm_total_min
print("The range is", range)
The mean is 51.449394026835954 The middle point/median is 52.0 The minimum value is 0.0 The maximum value is 100.0 The standard deviation is 20.783523256076894 The range is 100.0
#Calculate mean
average = Weather_subset["Pressure9am"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["Pressure9am"].median()
print("The middle point/median is", middle_point)
#define max and min
Pressure9am_total = Weather_subset.Pressure9am
Pressure9am_total_min, Pressure9am_total_max = Pressure9am_total.min(), Pressure9am_total.max()
print("The minimum value is", Pressure9am_total_min)
print("The maximum value is", Pressure9am_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["Pressure9am"].std()
print("The standard deviation is", deviation)
range = Pressure9am_total_max - Pressure9am_total_min
print("The range is", range)
The mean is 1017.6426408450703 The middle point/median is 1017.6 The minimum value is 982.0 The maximum value is 1040.4 The standard deviation is 7.085378932290495 The range is 58.40000000000009
#Calculate mean
average = Weather_subset["Pressure3pm"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["Pressure3pm"].median()
print("The middle point/median is", middle_point)
#define max and min
Pressure3pm_total = Weather_subset.Pressure3pm
Pressure3pm_total_min, Pressure3pm_total_max = Pressure3pm_total.min(), Pressure3pm_total.max()
print("The minimum value is", Pressure3pm_total_min)
print("The maximum value is", Pressure3pm_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["Pressure3pm"].std()
print("The standard deviation is", deviation)
range = Pressure3pm_total_max - Pressure3pm_total_min
print("The range is", range)
The mean is 1015.2381227267394 The middle point/median is 1015.2 The minimum value is 977.1 The maximum value is 1038.9 The standard deviation is 7.013009882684533 The range is 61.80000000000007
#Calculate mean
average = Weather_subset["Cloud9am"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["Cloud9am"].median()
print("The middle point/median is", middle_point)
#define max and min
Cloud9am_total = Weather_subset.Cloud9am
Cloud9am_total_min, Cloud9am_total_max = Cloud9am_total.min(), Cloud9am_total.max()
print("The minimum value is", Cloud9am_total_min)
print("The maximum value is", Cloud9am_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["Cloud9am"].std()
print("The standard deviation is", deviation)
range = Cloud9am_total_max - Cloud9am_total_min
print("The range is", range)
The mean is 4.433657873882034 The middle point/median is 5.0 The minimum value is 0.0 The maximum value is 8.0 The standard deviation is 2.8876965261576775 The range is 8.0
#Calculate mean
average = Weather_subset["Cloud3pm"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["Cloud3pm"].median()
print("The middle point/median is", middle_point)
#define max and min
Cloud3pm_total = Weather_subset.Cloud3pm
Cloud3pm_total_min, Cloud3pm_total_max = Cloud3pm_total.min(), Cloud3pm_total.max()
print("The minimum value is", Cloud3pm_total_min)
print("The maximum value is", Cloud3pm_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["Cloud3pm"].std()
print("The standard deviation is", deviation)
range = Cloud3pm_total_max - Cloud3pm_total_min
print("The range is", range)
The mean is 4.494172356957852 The middle point/median is 5.0 The minimum value is 0.0 The maximum value is 8.0 The standard deviation is 2.718872198300328 The range is 8.0
#Calculate mean
average = Weather_subset["Temp9am"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["Temp9am"].median()
print("The middle point/median is", middle_point)
#define max and min
Temp9am_total = Weather_subset.Temp9am
Temp9am_total_min, Temp9am_total_max = Temp9am_total.min(), Temp9am_total.max()
print("The minimum value is", Temp9am_total_min)
print("The maximum value is", Temp9am_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["Temp9am"].std()
print("The standard deviation is", deviation)
range = Temp9am_total_max - Temp9am_total_min
print("The range is", range)
The mean is 16.93590696686491 The middle point/median is 16.7 The minimum value is -7.0 The maximum value is 39.0 The standard deviation is 6.486650906416592 The range is 46.0
#Calculate mean
average = Weather_subset["Temp3pm"].mean()
print("The mean is", average)
#Calculate median
middle_point = Weather_subset["Temp3pm"].median()
print("The middle point/median is", middle_point)
#define max and min
Temp3pm_total = Weather_subset.Temp3pm
Temp3pm_total_min, Temp3pm_total_max = Temp3pm_total.min(), Temp3pm_total.max()
print("The minimum value is", Temp3pm_total_min)
print("The maximum value is", Temp3pm_total_max)
#Calculate standard deviation and range
deviation = Weather_subset["Temp3pm"].std()
print("The standard deviation is", deviation)
range = Temp3pm_total_max - Temp3pm_total_min
print("The range is", range)
The mean is 21.66672643167067 The middle point/median is 21.1 The minimum value is -4.2 The maximum value is 46.2 The standard deviation is 6.921654674299337 The range is 50.400000000000006
# Here we can see more clear the relation between the rain of the day with the pressure and temperature by months, where months with higher atmosphere pressure posses bigger indices of rain.
#This is one of the variables we will work with on the model
Weather_tembypress = Weather_subset.groupby(["Month", "RainToday"])[["Pressure3pm","Temp3pm"]].apply(sum)
plt.figure(figsize = (13,9))
sns.lineplot(x="Month", y=Weather_tembypress.Pressure3pm/Weather_tembypress.Temp3pm*100, hue = "RainToday",
data = Weather_tembypress, linewidth = 2.5, style = "RainToday", markers=True, dashes=False)
plt.title("Overview of rain cases related to pressure and temperature" )
plt.xticks(rotation = 90)
plt.show()
# To show Categorical columns to be investigated
print("Categorical variables are:\n ", categorical)
Categorical variables are: ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
# check for cardinality of categorical variables
for var in categorical:
print(var, ' contains ', len(Weather_subset[var].unique()), ' labels')
Location contains 49 labels WindGustDir contains 17 labels WindDir9am contains 17 labels WindDir3pm contains 17 labels RainToday contains 3 labels
#Calculate Location mode
Most_common_value = Weather_subset["Location"].mode()
print("The Most common value/mode for Location is", Most_common_value)
#Calculate WindGustDir mode
Most_common_value = Weather_subset["WindGustDir"].mode()
print("The Most common value/mode for WindGustDir is", Most_common_value)
#Calculate WindDir9am mode
Most_common_value = Weather_subset["WindDir9am"].mode()
print("The Most common value/mode for WindDir9am is", Most_common_value)
#Calculate WindDir3pm mode
Most_common_value = Weather_subset["WindDir3pm"].mode()
print("The Most common value/mode for WindDir3pm is", Most_common_value)
#Calculate RainToday mode
Most_common_value = Weather_subset["RainToday"].mode()
print("The Most common value/mode for RainToday is", Most_common_value)
The Most common value/mode for Location is 0 Canberra dtype: object The Most common value/mode for WindGustDir is 0 W dtype: object The Most common value/mode for WindDir9am is 0 N dtype: object The Most common value/mode for WindDir3pm is 0 SE dtype: object The Most common value/mode for RainToday is 0 No dtype: object
# To show number of values in each category for Location
Weather_subset['Location'].value_counts()
Canberra 703 Sydney 681 MountGambier 651 Darwin 648 GoldCoast 634 NorfolkIsland 632 Tuggeranong 627 Penrith 624 Hobart 623 Albury 622 Brisbane 621 MountGinini 619 Perth 613 Bendigo 612 MelbourneAirport 611 Mildura 610 Nuriootpa 609 Woomera 608 Newcastle 606 CoffsHarbour 605 Sale 602 Ballarat 601 Portland 600 BadgerysCreek 598 Cobar 598 Townsville 597 Adelaide 595 SydneyAirport 595 Watsonia 591 AliceSprings 588 PerthAirport 587 Richmond 586 Launceston 583 WaggaWagga 582 Dartmoor 580 Wollongong 580 Moree 576 Cairns 576 NorahHead 568 SalmonGums 565 Albany 565 Walpole 564 Witchcliffe 563 PearceRAAF 557 Williamtown 526 Melbourne 461 Nhil 315 Uluru 303 Katherine 277 Name: Location, dtype: int64
# To visualise distribution of each category for Location
plt.figure(figsize=(25, 4))
sns.countplot(Weather_subset['Location'])
plt.xticks(rotation=-25)
plt.show()
# To show number of values in each category for WindGustDir
Weather_subset['WindGustDir'].value_counts()
W 1957 N 1855 E 1850 SE 1849 SSE 1801 S 1787 WSW 1773 SW 1730 SSW 1703 WNW 1691 ENE 1607 NW 1596 ESE 1442 NE 1390 NNE 1267 NNW 1266 Name: WindGustDir, dtype: int64
# To visualise distribution of each category for WindGustDir
plt.figure(figsize=(25, 4))
sns.countplot(Weather_subset['WindGustDir'])
plt.xticks(rotation=-25)
plt.show()
# To show number of values in each category for WindDir9am
Weather_subset['WindDir9am'].value_counts()
N 2321 SE 1849 E 1822 SSE 1763 S 1692 W 1688 NW 1682 SW 1651 NNW 1599 NNE 1565 ENE 1552 SSW 1521 ESE 1510 NE 1497 WNW 1412 WSW 1344 Name: WindDir9am, dtype: int64
# To visualise distribution of each category for WindDir9am
plt.figure(figsize=(25, 4))
sns.countplot(Weather_subset['WindDir9am'])
plt.xticks(rotation=-25)
plt.show()
# To show number of values in each category for WindDir3pm
Weather_subset['WindDir3pm'].value_counts()
SE 2122 W 1968 WSW 1938 S 1911 SSE 1800 SW 1770 WNW 1765 NW 1747 N 1697 E 1685 ESE 1641 SSW 1621 NE 1610 NNW 1565 ENE 1516 NNE 1316 Name: WindDir3pm, dtype: int64
# To visualise distribution of each category for WindDir3pm
plt.figure(figsize=(25, 4))
sns.countplot(Weather_subset['WindDir3pm'])
plt.xticks(rotation=-25)
plt.show()
# To show number of values in each category for RainToday
Weather_subset['RainToday'].value_counts()
No 21894 Yes 6253 Name: RainToday, dtype: int64
# To visualise distribution of each category for RainToday
plt.figure(figsize=(25, 4))
sns.countplot(Weather_subset['RainToday'])
plt.xticks(rotation=-25)
plt.show()
#to visualise missing data in subset
sns.heatmap(Weather_subset.isnull(), cbar=False)
<AxesSubplot:>
msno.heatmap(Weather_subset,figsize=(16,6))
<AxesSubplot:>
# check missing values in numerical variables
Weather_subset.isnull().sum()
Location 0 MinTemp 142 MaxTemp 77 Rainfall 291 Evaporation 12215 Sunshine 13606 WindGustDir 1874 WindGustSpeed 1867 WindDir9am 1970 WindDir3pm 766 WindSpeed9am 277 WindSpeed3pm 550 Humidity9am 356 Humidity3pm 714 Pressure9am 2878 Pressure3pm 2869 Cloud9am 10772 Cloud3pm 11450 Temp9am 190 Temp3pm 551 RainToday 291 RainTomorrow 0 Year 0 Month 0 Day 0 dtype: int64
# To show missing values for each column in number and percentage
def missing_values_table(Weather_subset):
# For total missing values
mis_val = Weather_subset.isnull().sum()
# For percentage of missing values
mis_val_percent = 100 * Weather_subset.isnull().sum() / len(Weather_subset)
# To make a table with the results
mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
# To rename the columns
mis_val_table_ren_columns = mis_val_table.rename(
columns = {0 : 'Missing Values', 1 : '% of Total Values'})
# To sort the table by percentage of missing descending
mis_val_table_ren_columns = mis_val_table_ren_columns[
mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
'% of Total Values', ascending=False).round(1)
# To return the dataframe with missing information
return mis_val_table_ren_columns
# Print result
missing_values_table(Weather_subset)
| Missing Values | % of Total Values | |
|---|---|---|
| Sunshine | 13606 | 47.8 |
| Evaporation | 12215 | 43.0 |
| Cloud3pm | 11450 | 40.3 |
| Cloud9am | 10772 | 37.9 |
| Pressure9am | 2878 | 10.1 |
| Pressure3pm | 2869 | 10.1 |
| WindDir9am | 1970 | 6.9 |
| WindGustDir | 1874 | 6.6 |
| WindGustSpeed | 1867 | 6.6 |
| WindDir3pm | 766 | 2.7 |
| Humidity3pm | 714 | 2.5 |
| Temp3pm | 551 | 1.9 |
| WindSpeed3pm | 550 | 1.9 |
| Humidity9am | 356 | 1.3 |
| RainToday | 291 | 1.0 |
| Rainfall | 291 | 1.0 |
| WindSpeed9am | 277 | 1.0 |
| Temp9am | 190 | 0.7 |
| MinTemp | 142 | 0.5 |
| MaxTemp | 77 | 0.3 |
Per the above there are missing values for every column. However some columns are missing substantial amounts of values such as Sunshine with almost half (47.7%) of values missing
#To show categorical columns to be investigated
print("Categorical variables are:\n ", categorical)
Categorical variables are: ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
# to visualise histograms for each cateogorical variable
plt.figure(figsize=(20,15))
cont_columns = [0,6,8,9,21]
j=0
for i in cont_columns:
j = j+1
plt.subplot(3, 2, j)
fig = Weather_subset[Weather_subset.columns[i]].hist(bins=10)
fig.set_xlabel(Weather_subset.columns[i])
fig.set_ylabel('RainTomorrow')
def catplt(variable,to):
"Function for visualization of categorical variables."
var = Weather_subset[variable]
values=var.value_counts()
f, ax = plt.subplots(figsize = (8,8))
g = sns.barplot(x = variable, y = to, data = Weather_subset)
g.set_xticklabels(g.get_xticklabels(),rotation = 90)
plt.show()
print("{}:\n{}".format(variable,values))
plt.show()
for i in categorical:
catplt(i, "RainTomorrow")
Location: Canberra 703 Sydney 681 MountGambier 651 Darwin 648 GoldCoast 634 NorfolkIsland 632 Tuggeranong 627 Penrith 624 Hobart 623 Albury 622 Brisbane 621 MountGinini 619 Perth 613 Bendigo 612 MelbourneAirport 611 Mildura 610 Nuriootpa 609 Woomera 608 Newcastle 606 CoffsHarbour 605 Sale 602 Ballarat 601 Portland 600 BadgerysCreek 598 Cobar 598 Townsville 597 Adelaide 595 SydneyAirport 595 Watsonia 591 AliceSprings 588 PerthAirport 587 Richmond 586 Launceston 583 WaggaWagga 582 Dartmoor 580 Wollongong 580 Moree 576 Cairns 576 NorahHead 568 SalmonGums 565 Albany 565 Walpole 564 Witchcliffe 563 PearceRAAF 557 Williamtown 526 Melbourne 461 Nhil 315 Uluru 303 Katherine 277 Name: Location, dtype: int64
WindGustDir: W 1957 N 1855 E 1850 SE 1849 SSE 1801 S 1787 WSW 1773 SW 1730 SSW 1703 WNW 1691 ENE 1607 NW 1596 ESE 1442 NE 1390 NNE 1267 NNW 1266 Name: WindGustDir, dtype: int64
WindDir9am: N 2321 SE 1849 E 1822 SSE 1763 S 1692 W 1688 NW 1682 SW 1651 NNW 1599 NNE 1565 ENE 1552 SSW 1521 ESE 1510 NE 1497 WNW 1412 WSW 1344 Name: WindDir9am, dtype: int64
WindDir3pm: SE 2122 W 1968 WSW 1938 S 1911 SSE 1800 SW 1770 WNW 1765 NW 1747 N 1697 E 1685 ESE 1641 SSW 1621 NE 1610 NNW 1565 ENE 1516 NNE 1316 Name: WindDir3pm, dtype: int64
RainToday: No 21894 Yes 6253 Name: RainToday, dtype: int64
#To show categorical columns to be investigated
print("Numerical variables are:\n ", numerical)
Numerical variables are: ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
numerical2 = numerical[:]
numerical_hist = Weather_subset[numerical2]
# to visualise histograms for each numerical variable
numerical_hist.hist(figsize = [20,20], bins = 50)
plt.show()
#To show categorical columns to be investigated
print("Numerical variables are:\n ", numerical)
Numerical variables are: ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
numerical_scatter = Weather_subset[numerical2]
# here we can see the relation between the correlation of the variables plotted in pairs
plt.figure(figsize=(20, 10))
sns.pairplot(numerical_scatter)
plt.show()
<Figure size 1440x720 with 0 Axes>
def numbplt(data,variable,to):
"Function for visualization of numerical variables."
c = sns.FacetGrid(data,col=to,height=6)
c.map(sns.distplot,variable,bins=25)
for k in numerical:
numbplt(Weather_subset, k, "RainTomorrow")
#To show numerical columns to be investigated
print("Numerical variables are:\n ", numerical)
Numerical variables are: ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
# To create correlation matrix for all numerical columns
correlation_matrix = numerical_scatter.corr().round(2)
fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(data=correlation_matrix, annot=True)
<AxesSubplot:>
#To show categorical columns to be investigated and compared
print("Categorical variables are:\n ", categorical)
Categorical variables are: ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
# Location v WindGustDir
ctable1 = pd.crosstab(Weather_subset.Location, Weather_subset.WindGustDir)
ctable1
| WindGustDir | E | ENE | ESE | N | NE | NNE | NNW | NW | S | SE | SSE | SSW | SW | W | WNW | WSW |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Location | ||||||||||||||||
| Adelaide | 17 | 19 | 34 | 45 | 13 | 16 | 40 | 37 | 11 | 48 | 33 | 17 | 72 | 46 | 32 | 110 |
| Albury | 22 | 30 | 25 | 46 | 37 | 24 | 32 | 31 | 17 | 56 | 37 | 10 | 25 | 99 | 86 | 45 |
| AliceSprings | 82 | 38 | 124 | 21 | 14 | 14 | 21 | 18 | 31 | 74 | 46 | 25 | 28 | 12 | 13 | 17 |
| BadgerysCreek | 66 | 63 | 49 | 28 | 28 | 37 | 7 | 23 | 34 | 27 | 18 | 25 | 55 | 40 | 33 | 62 |
| Ballarat | 2 | 1 | 30 | 94 | 5 | 51 | 35 | 22 | 60 | 78 | 37 | 23 | 32 | 56 | 30 | 41 |
| Bendigo | 6 | 20 | 3 | 55 | 34 | 42 | 24 | 41 | 41 | 31 | 102 | 24 | 47 | 43 | 40 | 53 |
| Brisbane | 107 | 94 | 60 | 7 | 66 | 27 | 7 | 3 | 8 | 59 | 40 | 9 | 11 | 71 | 23 | 27 |
| Cairns | 36 | 46 | 49 | 31 | 39 | 42 | 0 | 2 | 45 | 180 | 84 | 5 | 0 | 6 | 4 | 0 |
| Canberra | 66 | 57 | 25 | 41 | 18 | 11 | 91 | 104 | 35 | 20 | 10 | 11 | 4 | 32 | 88 | 7 |
| Cobar | 51 | 40 | 36 | 25 | 40 | 31 | 20 | 19 | 48 | 21 | 20 | 49 | 74 | 28 | 27 | 58 |
| CoffsHarbour | 24 | 15 | 9 | 28 | 71 | 65 | 19 | 14 | 60 | 17 | 26 | 71 | 68 | 4 | 12 | 37 |
| Dartmoor | 13 | 5 | 43 | 57 | 6 | 27 | 45 | 43 | 47 | 37 | 21 | 65 | 71 | 28 | 44 | 27 |
| Darwin | 113 | 44 | 56 | 68 | 17 | 28 | 47 | 66 | 10 | 31 | 14 | 13 | 9 | 26 | 79 | 15 |
| GoldCoast | 23 | 24 | 28 | 33 | 34 | 72 | 14 | 36 | 65 | 114 | 114 | 2 | 4 | 17 | 34 | 1 |
| Hobart | 5 | 4 | 28 | 32 | 8 | 8 | 127 | 120 | 29 | 48 | 23 | 50 | 51 | 27 | 30 | 26 |
| Katherine | 63 | 19 | 43 | 13 | 8 | 8 | 7 | 19 | 9 | 38 | 15 | 1 | 2 | 8 | 14 | 6 |
| Launceston | 13 | 4 | 5 | 48 | 3 | 10 | 176 | 150 | 16 | 22 | 63 | 4 | 11 | 14 | 25 | 12 |
| Melbourne | 0 | 1 | 3 | 143 | 0 | 18 | 18 | 10 | 60 | 17 | 53 | 31 | 39 | 25 | 13 | 26 |
| MelbourneAirport | 1 | 0 | 0 | 203 | 2 | 29 | 11 | 11 | 72 | 20 | 90 | 32 | 45 | 37 | 13 | 41 |
| Mildura | 7 | 10 | 18 | 52 | 25 | 35 | 25 | 37 | 56 | 41 | 54 | 65 | 43 | 48 | 43 | 50 |
| Moree | 59 | 51 | 18 | 20 | 57 | 65 | 11 | 17 | 19 | 10 | 11 | 44 | 69 | 29 | 21 | 61 |
| MountGambier | 12 | 4 | 17 | 89 | 15 | 35 | 41 | 30 | 66 | 21 | 39 | 68 | 58 | 63 | 40 | 44 |
| MountGinini | 32 | 46 | 23 | 8 | 13 | 18 | 23 | 74 | 22 | 6 | 13 | 28 | 53 | 88 | 46 | 85 |
| Nhil | 13 | 9 | 11 | 16 | 9 | 3 | 17 | 26 | 34 | 4 | 21 | 30 | 34 | 39 | 21 | 28 |
| NorahHead | 8 | 13 | 12 | 37 | 75 | 94 | 9 | 15 | 92 | 16 | 68 | 60 | 5 | 22 | 27 | 15 |
| NorfolkIsland | 99 | 58 | 43 | 20 | 30 | 33 | 10 | 17 | 31 | 61 | 47 | 40 | 38 | 33 | 32 | 30 |
| Nuriootpa | 12 | 20 | 29 | 21 | 34 | 19 | 19 | 36 | 4 | 90 | 33 | 13 | 42 | 81 | 68 | 80 |
| PearceRAAF | 105 | 55 | 47 | 12 | 12 | 11 | 17 | 11 | 22 | 12 | 1 | 31 | 70 | 27 | 25 | 74 |
| Penrith | 28 | 47 | 56 | 33 | 38 | 37 | 17 | 39 | 46 | 67 | 40 | 45 | 37 | 33 | 42 | 19 |
| Perth | 32 | 26 | 28 | 16 | 47 | 18 | 10 | 30 | 12 | 22 | 32 | 94 | 133 | 25 | 33 | 55 |
| PerthAirport | 115 | 62 | 24 | 16 | 23 | 15 | 8 | 29 | 12 | 15 | 3 | 36 | 84 | 45 | 17 | 70 |
| Portland | 30 | 11 | 35 | 39 | 6 | 19 | 35 | 26 | 31 | 50 | 15 | 30 | 37 | 99 | 65 | 68 |
| Richmond | 59 | 51 | 57 | 10 | 51 | 16 | 20 | 10 | 40 | 60 | 39 | 30 | 24 | 57 | 31 | 25 |
| Sale | 100 | 29 | 66 | 4 | 5 | 3 | 2 | 43 | 19 | 20 | 11 | 11 | 13 | 134 | 54 | 64 |
| SalmonGums | 36 | 26 | 25 | 44 | 25 | 21 | 25 | 38 | 49 | 35 | 65 | 58 | 31 | 18 | 35 | 28 |
| Sydney | 26 | 44 | 25 | 8 | 30 | 25 | 13 | 11 | 36 | 12 | 51 | 36 | 10 | 105 | 17 | 20 |
| SydneyAirport | 23 | 41 | 10 | 12 | 71 | 43 | 10 | 33 | 89 | 24 | 20 | 72 | 21 | 50 | 32 | 35 |
| Townsville | 74 | 205 | 57 | 24 | 113 | 25 | 7 | 4 | 11 | 18 | 13 | 18 | 15 | 2 | 2 | 4 |
| Tuggeranong | 32 | 53 | 21 | 67 | 32 | 19 | 51 | 69 | 44 | 11 | 37 | 18 | 7 | 58 | 88 | 15 |
| Uluru | 74 | 36 | 35 | 7 | 15 | 6 | 16 | 6 | 20 | 23 | 16 | 14 | 5 | 6 | 13 | 8 |
| WaggaWagga | 81 | 83 | 19 | 31 | 17 | 15 | 17 | 28 | 13 | 14 | 17 | 15 | 41 | 64 | 47 | 74 |
| Walpole | 8 | 12 | 41 | 17 | 41 | 22 | 17 | 56 | 20 | 92 | 36 | 72 | 18 | 34 | 37 | 31 |
| Watsonia | 2 | 18 | 4 | 105 | 16 | 22 | 38 | 22 | 37 | 11 | 4 | 109 | 66 | 64 | 11 | 54 |
| Williamtown | 38 | 31 | 31 | 7 | 23 | 5 | 0 | 42 | 52 | 48 | 52 | 29 | 9 | 13 | 116 | 9 |
| Witchcliffe | 10 | 9 | 10 | 45 | 6 | 20 | 22 | 24 | 54 | 49 | 110 | 49 | 42 | 35 | 41 | 36 |
| Wollongong | 7 | 17 | 9 | 26 | 90 | 32 | 18 | 38 | 104 | 18 | 35 | 59 | 22 | 38 | 31 | 34 |
| Woomera | 18 | 16 | 21 | 51 | 28 | 31 | 27 | 16 | 54 | 61 | 72 | 62 | 55 | 28 | 16 | 46 |
# For chi-square statistic between Location v WindGustDir
statistic, p, dof, exfr = stats.chi2_contingency(ctable1, correction=True)
print('The chi square statistic is {:.2f}. The critical value for {} degrees of freedom at an alpha level of 0.05 is {:.2f}.'.
format(statistic, dof, stats.chi2.ppf(0.95, 1)))
The chi square statistic is 16105.65. The critical value for 690 degrees of freedom at an alpha level of 0.05 is 3.84.
"" As the chi-square statistic is more extreme than the critical value, i.e. it lies in the rejection region, the assumption of independence of the Location and WindGustDir variables can be rejected ""
# Location v WindDir9am
ctable2 = pd.crosstab(Weather_subset.Location, Weather_subset.WindDir9am)
ctable2
| WindDir9am | E | ENE | ESE | N | NE | NNE | NNW | NW | S | SE | SSE | SSW | SW | W | WNW | WSW |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Location | ||||||||||||||||
| Adelaide | 13 | 33 | 18 | 74 | 72 | 74 | 22 | 19 | 36 | 16 | 26 | 31 | 54 | 18 | 18 | 19 |
| Albany | 31 | 21 | 25 | 36 | 34 | 21 | 43 | 58 | 35 | 32 | 20 | 34 | 38 | 33 | 21 | 37 |
| Albury | 48 | 27 | 47 | 14 | 24 | 11 | 20 | 16 | 30 | 95 | 73 | 9 | 14 | 48 | 33 | 14 |
| AliceSprings | 97 | 41 | 95 | 31 | 31 | 22 | 36 | 14 | 15 | 47 | 27 | 15 | 15 | 18 | 13 | 16 |
| BadgerysCreek | 10 | 8 | 9 | 57 | 12 | 40 | 26 | 13 | 27 | 23 | 16 | 53 | 124 | 25 | 16 | 66 |
| Ballarat | 6 | 3 | 28 | 118 | 11 | 90 | 16 | 8 | 61 | 55 | 45 | 38 | 31 | 26 | 12 | 37 |
| Bendigo | 24 | 30 | 27 | 29 | 37 | 23 | 19 | 22 | 44 | 72 | 90 | 28 | 37 | 27 | 24 | 21 |
| Brisbane | 24 | 21 | 38 | 22 | 17 | 21 | 14 | 13 | 41 | 39 | 30 | 80 | 136 | 43 | 15 | 54 |
| Cairns | 13 | 1 | 23 | 6 | 12 | 2 | 10 | 10 | 120 | 92 | 247 | 1 | 7 | 18 | 5 | 3 |
| Canberra | 36 | 23 | 39 | 39 | 17 | 18 | 68 | 66 | 41 | 72 | 66 | 28 | 20 | 14 | 31 | 7 |
| Cobar | 61 | 58 | 44 | 35 | 56 | 49 | 22 | 11 | 49 | 44 | 37 | 45 | 25 | 12 | 17 | 27 |
| CoffsHarbour | 7 | 3 | 7 | 40 | 13 | 17 | 48 | 57 | 30 | 5 | 11 | 39 | 137 | 18 | 28 | 84 |
| Dartmoor | 33 | 28 | 30 | 80 | 16 | 42 | 47 | 24 | 23 | 18 | 23 | 29 | 17 | 18 | 31 | 23 |
| Darwin | 86 | 48 | 76 | 19 | 48 | 25 | 19 | 12 | 36 | 82 | 36 | 25 | 22 | 50 | 34 | 22 |
| GoldCoast | 26 | 10 | 24 | 32 | 20 | 18 | 34 | 96 | 97 | 47 | 115 | 31 | 10 | 25 | 26 | 13 |
| Hobart | 15 | 9 | 12 | 86 | 12 | 24 | 215 | 86 | 21 | 9 | 17 | 32 | 29 | 14 | 16 | 18 |
| Katherine | 19 | 12 | 46 | 7 | 6 | 3 | 23 | 32 | 6 | 46 | 8 | 3 | 6 | 6 | 38 | 8 |
| Launceston | 48 | 21 | 30 | 40 | 10 | 16 | 102 | 79 | 9 | 47 | 38 | 7 | 5 | 10 | 25 | 4 |
| Melbourne | 5 | 3 | 2 | 152 | 8 | 22 | 22 | 17 | 18 | 20 | 25 | 27 | 42 | 31 | 17 | 43 |
| MelbourneAirport | 0 | 3 | 1 | 225 | 6 | 17 | 32 | 18 | 29 | 20 | 23 | 39 | 52 | 53 | 31 | 56 |
| Mildura | 32 | 25 | 21 | 40 | 38 | 63 | 25 | 25 | 67 | 48 | 71 | 44 | 30 | 24 | 20 | 23 |
| Moree | 91 | 70 | 23 | 38 | 82 | 106 | 8 | 5 | 12 | 18 | 9 | 35 | 25 | 12 | 6 | 17 |
| MountGambier | 28 | 19 | 19 | 93 | 38 | 64 | 63 | 29 | 26 | 43 | 35 | 27 | 53 | 36 | 24 | 36 |
| MountGinini | 39 | 35 | 32 | 9 | 28 | 17 | 23 | 52 | 14 | 10 | 8 | 34 | 54 | 101 | 59 | 61 |
| Newcastle | 0 | 0 | 0 | 32 | 49 | 0 | 3 | 118 | 3 | 80 | 0 | 1 | 34 | 8 | 0 | 0 |
| Nhil | 15 | 25 | 7 | 28 | 20 | 16 | 22 | 18 | 34 | 6 | 25 | 21 | 23 | 21 | 12 | 13 |
| NorahHead | 6 | 14 | 14 | 39 | 22 | 45 | 38 | 47 | 39 | 11 | 22 | 63 | 46 | 35 | 43 | 63 |
| NorfolkIsland | 87 | 38 | 72 | 13 | 45 | 36 | 13 | 31 | 46 | 69 | 32 | 35 | 25 | 22 | 29 | 32 |
| Nuriootpa | 43 | 61 | 44 | 45 | 101 | 46 | 16 | 18 | 6 | 31 | 12 | 17 | 27 | 43 | 42 | 27 |
| PearceRAAF | 95 | 60 | 49 | 56 | 34 | 39 | 24 | 13 | 44 | 27 | 26 | 16 | 13 | 14 | 12 | 16 |
| Penrith | 9 | 12 | 19 | 84 | 14 | 31 | 35 | 16 | 68 | 27 | 36 | 90 | 44 | 12 | 8 | 11 |
| Perth | 99 | 70 | 52 | 28 | 75 | 44 | 4 | 9 | 31 | 43 | 41 | 15 | 20 | 19 | 15 | 22 |
| PerthAirport | 82 | 115 | 25 | 38 | 65 | 58 | 9 | 19 | 28 | 21 | 19 | 38 | 21 | 12 | 13 | 21 |
| Portland | 28 | 23 | 28 | 94 | 16 | 31 | 70 | 31 | 30 | 31 | 27 | 25 | 47 | 39 | 24 | 43 |
| Richmond | 16 | 25 | 20 | 31 | 44 | 42 | 24 | 10 | 41 | 16 | 20 | 63 | 33 | 13 | 11 | 18 |
| Sale | 27 | 37 | 13 | 23 | 14 | 25 | 34 | 70 | 8 | 6 | 13 | 16 | 16 | 133 | 120 | 30 |
| SalmonGums | 32 | 45 | 19 | 61 | 48 | 31 | 39 | 34 | 25 | 29 | 21 | 28 | 27 | 48 | 39 | 29 |
| Sydney | 24 | 11 | 35 | 21 | 15 | 20 | 21 | 21 | 38 | 17 | 23 | 30 | 11 | 264 | 94 | 26 |
| SydneyAirport | 10 | 7 | 6 | 36 | 14 | 32 | 48 | 106 | 49 | 15 | 17 | 56 | 23 | 60 | 73 | 41 |
| Townsville | 33 | 50 | 92 | 12 | 28 | 18 | 23 | 17 | 33 | 102 | 65 | 50 | 23 | 10 | 6 | 9 |
| Tuggeranong | 16 | 28 | 15 | 42 | 23 | 17 | 37 | 69 | 51 | 15 | 33 | 27 | 26 | 40 | 47 | 28 |
| Uluru | 74 | 24 | 72 | 7 | 12 | 5 | 5 | 4 | 13 | 52 | 12 | 11 | 2 | 2 | 2 | 0 |
| WaggaWagga | 143 | 149 | 37 | 25 | 29 | 13 | 26 | 18 | 4 | 18 | 16 | 2 | 10 | 25 | 18 | 23 |
| Walpole | 48 | 59 | 27 | 41 | 26 | 32 | 35 | 53 | 14 | 33 | 24 | 17 | 22 | 24 | 48 | 23 |
| Watsonia | 30 | 70 | 11 | 64 | 58 | 49 | 32 | 15 | 21 | 8 | 28 | 26 | 36 | 49 | 14 | 48 |
| Williamtown | 6 | 11 | 9 | 27 | 29 | 23 | 35 | 93 | 23 | 14 | 20 | 16 | 19 | 36 | 124 | 27 |
| Witchcliffe | 38 | 23 | 44 | 56 | 14 | 30 | 9 | 25 | 34 | 86 | 49 | 23 | 25 | 22 | 24 | 23 |
| Wollongong | 18 | 13 | 17 | 42 | 28 | 35 | 29 | 35 | 59 | 22 | 21 | 71 | 76 | 31 | 23 | 37 |
| Woomera | 51 | 30 | 67 | 54 | 26 | 42 | 11 | 10 | 63 | 70 | 65 | 30 | 19 | 26 | 11 | 25 |
# For chi-square statistic between Location v WindDir9am
statistic, p, dof, exfr = stats.chi2_contingency(ctable2, correction=True)
print('The chi square statistic is {:.2f}. The critical value for {} degrees of freedom at an alpha level of 0.05 is {:.2f}.'.
format(statistic, dof, stats.chi2.ppf(0.95, 1)))
The chi square statistic is 17533.43. The critical value for 720 degrees of freedom at an alpha level of 0.05 is 3.84.
"" As the chi-square statistic is more extreme than the critical value, i.e. it lies in the rejection region, the assumption of independence of the Location and WindDir9am variables can be rejected ""
# Location v WindDir3pm
ctable3 = pd.crosstab(Weather_subset.Location, Weather_subset.WindDir3pm)
ctable3
| WindDir3pm | E | ENE | ESE | N | NE | NNE | NNW | NW | S | SE | SSE | SSW | SW | W | WNW | WSW |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Location | ||||||||||||||||
| Adelaide | 8 | 8 | 13 | 26 | 15 | 16 | 34 | 53 | 23 | 19 | 31 | 27 | 95 | 52 | 35 | 136 |
| Albany | 41 | 5 | 48 | 22 | 4 | 5 | 16 | 19 | 24 | 42 | 13 | 24 | 39 | 33 | 15 | 54 |
| Albury | 21 | 16 | 29 | 38 | 30 | 28 | 27 | 45 | 32 | 44 | 49 | 18 | 23 | 87 | 67 | 64 |
| AliceSprings | 60 | 19 | 116 | 25 | 22 | 15 | 19 | 17 | 36 | 92 | 59 | 36 | 20 | 16 | 18 | 15 |
| BadgerysCreek | 56 | 57 | 46 | 41 | 50 | 57 | 17 | 17 | 28 | 40 | 24 | 30 | 22 | 28 | 34 | 39 |
| Ballarat | 3 | 5 | 13 | 95 | 7 | 35 | 49 | 30 | 55 | 48 | 38 | 57 | 49 | 44 | 32 | 37 |
| Bendigo | 16 | 24 | 15 | 44 | 36 | 36 | 44 | 52 | 37 | 22 | 51 | 38 | 64 | 42 | 44 | 41 |
| Brisbane | 89 | 103 | 80 | 7 | 98 | 42 | 3 | 7 | 19 | 50 | 16 | 10 | 12 | 41 | 22 | 18 |
| Cairns | 42 | 43 | 104 | 42 | 55 | 53 | 4 | 0 | 22 | 151 | 53 | 2 | 1 | 2 | 0 | 1 |
| Canberra | 22 | 34 | 22 | 41 | 21 | 18 | 101 | 108 | 23 | 23 | 19 | 19 | 14 | 39 | 125 | 21 |
| Cobar | 30 | 27 | 27 | 31 | 28 | 34 | 21 | 26 | 53 | 21 | 35 | 62 | 71 | 48 | 28 | 51 |
| CoffsHarbour | 47 | 34 | 32 | 17 | 99 | 61 | 12 | 2 | 74 | 44 | 59 | 33 | 19 | 5 | 5 | 13 |
| Dartmoor | 13 | 12 | 19 | 44 | 9 | 30 | 50 | 36 | 38 | 34 | 21 | 74 | 75 | 37 | 41 | 40 |
| Darwin | 41 | 34 | 30 | 61 | 20 | 34 | 111 | 86 | 8 | 24 | 6 | 8 | 7 | 42 | 119 | 16 |
| GoldCoast | 33 | 43 | 49 | 29 | 41 | 100 | 15 | 21 | 40 | 111 | 103 | 2 | 2 | 14 | 17 | 3 |
| Hobart | 18 | 13 | 67 | 33 | 15 | 14 | 85 | 93 | 26 | 99 | 25 | 34 | 39 | 20 | 28 | 11 |
| Katherine | 51 | 18 | 52 | 4 | 9 | 3 | 9 | 13 | 8 | 39 | 17 | 3 | 6 | 15 | 19 | 9 |
| Launceston | 6 | 7 | 12 | 39 | 5 | 9 | 189 | 145 | 16 | 26 | 35 | 9 | 5 | 16 | 43 | 9 |
| Melbourne | 3 | 4 | 6 | 71 | 11 | 24 | 30 | 19 | 104 | 22 | 35 | 43 | 25 | 22 | 16 | 23 |
| MelbourneAirport | 5 | 3 | 9 | 110 | 8 | 32 | 42 | 32 | 127 | 22 | 72 | 32 | 31 | 30 | 19 | 35 |
| Mildura | 15 | 12 | 18 | 44 | 17 | 31 | 51 | 38 | 70 | 38 | 33 | 48 | 49 | 47 | 41 | 56 |
| Moree | 32 | 30 | 19 | 57 | 25 | 37 | 24 | 23 | 37 | 20 | 17 | 34 | 78 | 46 | 34 | 56 |
| MountGambier | 8 | 9 | 12 | 61 | 15 | 29 | 62 | 31 | 68 | 21 | 52 | 54 | 65 | 71 | 36 | 51 |
| MountGinini | 24 | 42 | 12 | 21 | 14 | 31 | 23 | 59 | 7 | 8 | 9 | 7 | 31 | 120 | 97 | 82 |
| Newcastle | 1 | 0 | 0 | 7 | 56 | 0 | 0 | 79 | 10 | 158 | 0 | 0 | 25 | 4 | 0 | 1 |
| Nhil | 5 | 9 | 3 | 20 | 6 | 13 | 25 | 29 | 20 | 8 | 20 | 31 | 30 | 34 | 22 | 40 |
| NorahHead | 36 | 40 | 27 | 25 | 117 | 19 | 10 | 15 | 85 | 31 | 59 | 35 | 10 | 13 | 29 | 11 |
| NorfolkIsland | 66 | 40 | 60 | 20 | 25 | 30 | 9 | 27 | 42 | 97 | 46 | 36 | 40 | 29 | 37 | 28 |
| Nuriootpa | 31 | 18 | 21 | 26 | 26 | 32 | 23 | 35 | 13 | 31 | 27 | 22 | 53 | 85 | 55 | 105 |
| PearceRAAF | 44 | 23 | 49 | 19 | 16 | 7 | 20 | 14 | 32 | 30 | 26 | 55 | 71 | 43 | 22 | 81 |
| Penrith | 34 | 28 | 42 | 42 | 50 | 59 | 31 | 33 | 40 | 42 | 30 | 49 | 35 | 36 | 36 | 25 |
| Perth | 22 | 26 | 25 | 11 | 27 | 6 | 16 | 30 | 21 | 37 | 22 | 91 | 109 | 47 | 31 | 91 |
| PerthAirport | 44 | 27 | 27 | 13 | 13 | 24 | 8 | 23 | 15 | 23 | 18 | 26 | 98 | 69 | 32 | 127 |
| Portland | 33 | 7 | 37 | 28 | 10 | 13 | 36 | 25 | 53 | 48 | 20 | 52 | 23 | 98 | 52 | 64 |
| Richmond | 51 | 64 | 39 | 26 | 69 | 37 | 19 | 20 | 28 | 46 | 34 | 30 | 14 | 41 | 31 | 18 |
| Sale | 115 | 14 | 68 | 8 | 5 | 6 | 10 | 37 | 21 | 33 | 20 | 12 | 26 | 112 | 35 | 75 |
| SalmonGums | 43 | 26 | 41 | 29 | 30 | 23 | 20 | 35 | 66 | 38 | 36 | 48 | 32 | 28 | 38 | 27 |
| Sydney | 124 | 79 | 76 | 15 | 60 | 7 | 9 | 12 | 46 | 44 | 61 | 27 | 12 | 50 | 28 | 26 |
| SydneyAirport | 38 | 52 | 28 | 16 | 80 | 23 | 15 | 28 | 75 | 58 | 50 | 39 | 11 | 30 | 22 | 28 |
| Townsville | 62 | 268 | 26 | 44 | 115 | 18 | 9 | 2 | 13 | 15 | 9 | 9 | 1 | 2 | 1 | 2 |
| Tuggeranong | 31 | 18 | 9 | 71 | 24 | 23 | 94 | 103 | 25 | 15 | 21 | 11 | 16 | 53 | 85 | 21 |
| Uluru | 62 | 23 | 38 | 13 | 16 | 10 | 8 | 15 | 6 | 32 | 21 | 11 | 14 | 9 | 15 | 8 |
| WaggaWagga | 34 | 48 | 17 | 27 | 35 | 35 | 22 | 42 | 13 | 15 | 13 | 27 | 48 | 80 | 50 | 73 |
| Walpole | 15 | 10 | 40 | 10 | 14 | 8 | 21 | 39 | 47 | 72 | 88 | 36 | 41 | 37 | 45 | 35 |
| Watsonia | 9 | 9 | 9 | 83 | 19 | 38 | 42 | 20 | 30 | 7 | 26 | 97 | 105 | 36 | 12 | 43 |
| Williamtown | 46 | 33 | 63 | 13 | 20 | 10 | 16 | 39 | 37 | 70 | 44 | 24 | 6 | 20 | 68 | 14 |
| Witchcliffe | 11 | 1 | 8 | 31 | 6 | 15 | 21 | 29 | 60 | 39 | 128 | 52 | 44 | 32 | 43 | 39 |
| Wollongong | 29 | 34 | 27 | 39 | 96 | 58 | 11 | 18 | 70 | 40 | 60 | 25 | 15 | 26 | 11 | 18 |
| Woomera | 15 | 17 | 11 | 58 | 21 | 28 | 32 | 26 | 68 | 33 | 49 | 72 | 49 | 37 | 30 | 57 |
# For chi-square statistic between Location v WindDir3pm
statistic, p, dof, exfr = stats.chi2_contingency(ctable3, correction=True)
print('The chi square statistic is {:.2f}. The critical value for {} degrees of freedom at an alpha level of 0.05 is {:.2f}.'.
format(statistic, dof, stats.chi2.ppf(0.95, 1)))
The chi square statistic is 16947.44. The critical value for 720 degrees of freedom at an alpha level of 0.05 is 3.84.
"" As the chi-square statistic is more extreme than the critical value, i.e. it lies in the rejection region, the assumption of independence of the Location and WindDir3pm variables can be rejected ""
# WindGustDir v WindDir9am
ctable4 = pd.crosstab(Weather_subset.WindGustDir, Weather_subset.WindDir9am)
ctable4
| WindDir9am | E | ENE | ESE | N | NE | NNE | NNW | NW | S | SE | SSE | SSW | SW | W | WNW | WSW |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| WindGustDir | ||||||||||||||||
| E | 472 | 255 | 271 | 31 | 85 | 40 | 29 | 39 | 72 | 163 | 106 | 51 | 48 | 27 | 51 | 31 |
| ENE | 234 | 309 | 165 | 49 | 170 | 66 | 39 | 47 | 58 | 105 | 75 | 46 | 50 | 31 | 32 | 40 |
| ESE | 197 | 90 | 270 | 36 | 38 | 35 | 32 | 24 | 75 | 224 | 128 | 57 | 62 | 29 | 20 | 27 |
| N | 55 | 81 | 44 | 619 | 161 | 292 | 130 | 105 | 15 | 33 | 28 | 29 | 23 | 46 | 43 | 29 |
| NE | 102 | 158 | 69 | 83 | 268 | 166 | 56 | 59 | 37 | 80 | 47 | 32 | 34 | 36 | 35 | 29 |
| NNE | 64 | 89 | 32 | 190 | 175 | 280 | 68 | 82 | 22 | 36 | 29 | 19 | 21 | 30 | 25 | 20 |
| NNW | 42 | 49 | 20 | 287 | 72 | 119 | 265 | 127 | 17 | 30 | 22 | 10 | 15 | 29 | 42 | 27 |
| NW | 52 | 61 | 29 | 234 | 86 | 98 | 284 | 291 | 28 | 32 | 24 | 26 | 26 | 66 | 125 | 36 |
| S | 58 | 40 | 56 | 37 | 26 | 33 | 37 | 44 | 399 | 117 | 242 | 250 | 178 | 62 | 48 | 80 |
| SE | 124 | 79 | 217 | 35 | 41 | 30 | 35 | 28 | 171 | 409 | 358 | 83 | 59 | 38 | 29 | 25 |
| SSE | 75 | 33 | 105 | 34 | 29 | 24 | 22 | 31 | 281 | 284 | 381 | 153 | 102 | 61 | 49 | 62 |
| SSW | 79 | 62 | 51 | 51 | 36 | 50 | 43 | 31 | 216 | 57 | 104 | 315 | 247 | 79 | 61 | 142 |
| SW | 69 | 50 | 49 | 90 | 42 | 54 | 63 | 50 | 106 | 60 | 71 | 209 | 331 | 127 | 75 | 211 |
| W | 46 | 42 | 18 | 154 | 58 | 82 | 141 | 163 | 27 | 25 | 30 | 50 | 130 | 447 | 257 | 167 |
| WNW | 39 | 47 | 24 | 176 | 54 | 89 | 189 | 273 | 26 | 33 | 23 | 33 | 44 | 169 | 309 | 65 |
| WSW | 46 | 66 | 33 | 110 | 49 | 73 | 90 | 90 | 63 | 29 | 48 | 101 | 185 | 261 | 141 | 298 |
# For chi-square statistic between WindGustDir v WindDir9am
statistic, p, dof, exfr = stats.chi2_contingency(ctable4, correction=True)
print('The chi square statistic is {:.2f}. The critical value for {} degrees of freedom at an alpha level of 0.05 is {:.2f}.'.
format(statistic, dof, stats.chi2.ppf(0.95, 1)))
The chi square statistic is 22458.35. The critical value for 225 degrees of freedom at an alpha level of 0.05 is 3.84.
"" As the chi-square statistic is more extreme than the critical value, i.e. it lies in the rejection region, the assumption of independence of the WindGustDir and WindDir9am variables can be rejected ""
# WindGustDir v WindDir3pm
ctable5 = pd.crosstab(Weather_subset.WindGustDir, Weather_subset.WindDir3pm)
ctable5
| WindDir3pm | E | ENE | ESE | N | NE | NNE | NNW | NW | S | SE | SSE | SSW | SW | W | WNW | WSW |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| WindGustDir | ||||||||||||||||
| E | 533 | 268 | 315 | 45 | 115 | 62 | 40 | 28 | 38 | 148 | 85 | 25 | 36 | 23 | 31 | 43 |
| ENE | 245 | 481 | 100 | 69 | 285 | 102 | 43 | 28 | 24 | 48 | 30 | 21 | 19 | 37 | 31 | 29 |
| ESE | 261 | 90 | 387 | 24 | 52 | 30 | 18 | 13 | 59 | 261 | 114 | 44 | 23 | 20 | 14 | 21 |
| N | 21 | 29 | 17 | 674 | 88 | 262 | 330 | 141 | 28 | 18 | 20 | 12 | 31 | 61 | 72 | 33 |
| NE | 73 | 258 | 34 | 99 | 494 | 194 | 39 | 21 | 21 | 38 | 17 | 22 | 13 | 15 | 26 | 17 |
| NNE | 36 | 88 | 20 | 223 | 255 | 391 | 88 | 34 | 17 | 12 | 13 | 16 | 7 | 14 | 23 | 16 |
| NNW | 12 | 14 | 20 | 195 | 31 | 44 | 398 | 282 | 6 | 21 | 9 | 15 | 21 | 46 | 116 | 25 |
| NW | 10 | 23 | 25 | 97 | 24 | 37 | 281 | 520 | 12 | 22 | 15 | 13 | 21 | 120 | 314 | 53 |
| S | 32 | 26 | 66 | 23 | 31 | 26 | 19 | 21 | 579 | 133 | 335 | 262 | 123 | 23 | 22 | 52 |
| SE | 133 | 56 | 313 | 22 | 23 | 20 | 16 | 22 | 134 | 668 | 283 | 50 | 32 | 20 | 21 | 30 |
| SSE | 72 | 37 | 116 | 21 | 26 | 21 | 20 | 22 | 342 | 328 | 551 | 103 | 51 | 19 | 21 | 42 |
| SSW | 36 | 22 | 39 | 17 | 23 | 15 | 27 | 21 | 330 | 81 | 142 | 475 | 267 | 50 | 24 | 125 |
| SW | 31 | 16 | 32 | 24 | 21 | 15 | 24 | 29 | 136 | 37 | 60 | 324 | 506 | 128 | 37 | 301 |
| W | 34 | 30 | 32 | 38 | 19 | 23 | 52 | 111 | 42 | 17 | 24 | 62 | 124 | 631 | 304 | 402 |
| WNW | 21 | 10 | 16 | 53 | 19 | 25 | 103 | 276 | 13 | 21 | 19 | 28 | 58 | 354 | 547 | 118 |
| WSW | 20 | 21 | 23 | 18 | 11 | 27 | 32 | 51 | 53 | 31 | 29 | 107 | 352 | 328 | 116 | 545 |
# For chi-square statistic between WindGustDir v WindDir3pm
statistic, p, dof, exfr = stats.chi2_contingency(ctable5, correction=True)
print('The chi square statistic is {:.2f}. The critical value for {} degrees of freedom at an alpha level of 0.05 is {:.2f}.'.
format(statistic, dof, stats.chi2.ppf(0.95, 1)))
The chi square statistic is 48623.33. The critical value for 225 degrees of freedom at an alpha level of 0.05 is 3.84.
"" As the chi-square statistic is more extreme than the critical value, i.e. it lies in the rejection region, the assumption of independence of the WindGustDir and WindDir3pm variables can be rejected ""
# WindDir9am v WindDir3pm
ctable6 = pd.crosstab(Weather_subset.WindDir9am, Weather_subset.WindDir3pm)
ctable6
| WindDir3pm | E | ENE | ESE | N | NE | NNE | NNW | NW | S | SE | SSE | SSW | SW | W | WNW | WSW |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| WindDir9am | ||||||||||||||||
| E | 295 | 221 | 209 | 81 | 117 | 84 | 70 | 69 | 64 | 158 | 98 | 76 | 85 | 63 | 54 | 59 |
| ENE | 176 | 222 | 105 | 106 | 163 | 122 | 81 | 71 | 39 | 69 | 55 | 60 | 68 | 66 | 53 | 78 |
| ESE | 245 | 148 | 229 | 45 | 85 | 44 | 28 | 41 | 75 | 211 | 134 | 44 | 65 | 27 | 27 | 50 |
| N | 48 | 54 | 41 | 447 | 134 | 194 | 369 | 252 | 49 | 36 | 46 | 51 | 77 | 173 | 199 | 120 |
| NE | 73 | 131 | 51 | 176 | 195 | 201 | 102 | 104 | 19 | 46 | 35 | 44 | 46 | 77 | 89 | 70 |
| NNE | 44 | 71 | 33 | 274 | 172 | 216 | 175 | 127 | 35 | 29 | 27 | 35 | 54 | 97 | 82 | 82 |
| NNW | 44 | 39 | 53 | 122 | 89 | 67 | 223 | 277 | 33 | 56 | 29 | 34 | 62 | 141 | 218 | 97 |
| NW | 46 | 40 | 52 | 85 | 91 | 82 | 132 | 280 | 40 | 60 | 21 | 41 | 66 | 197 | 268 | 105 |
| S | 87 | 65 | 102 | 19 | 48 | 31 | 30 | 27 | 367 | 171 | 295 | 178 | 119 | 28 | 35 | 69 |
| SE | 139 | 122 | 225 | 39 | 87 | 50 | 54 | 39 | 145 | 422 | 227 | 89 | 56 | 36 | 36 | 41 |
| SSE | 102 | 79 | 160 | 34 | 57 | 31 | 38 | 37 | 231 | 344 | 324 | 96 | 101 | 37 | 23 | 49 |
| SSW | 70 | 63 | 86 | 23 | 50 | 17 | 20 | 23 | 266 | 99 | 143 | 256 | 190 | 47 | 34 | 105 |
| SW | 56 | 63 | 67 | 23 | 56 | 24 | 20 | 30 | 224 | 107 | 120 | 229 | 258 | 109 | 41 | 183 |
| W | 75 | 38 | 65 | 39 | 54 | 32 | 31 | 81 | 80 | 55 | 58 | 101 | 168 | 352 | 165 | 276 |
| WNW | 60 | 39 | 39 | 41 | 41 | 22 | 43 | 130 | 49 | 50 | 54 | 55 | 73 | 259 | 273 | 177 |
| WSW | 54 | 38 | 40 | 27 | 38 | 26 | 18 | 29 | 108 | 51 | 62 | 144 | 197 | 163 | 67 | 269 |
# For chi-square statistic between WindDir9am v WindDir3pm
statistic, p, dof, exfr = stats.chi2_contingency(ctable6, correction=True)
print('The chi square statistic is {:.2f}. The critical value for {} degrees of freedom at an alpha level of 0.05 is {:.2f}.'.
format(statistic, dof, stats.chi2.ppf(0.95, 1)))
The chi square statistic is 16031.99. The critical value for 225 degrees of freedom at an alpha level of 0.05 is 3.84.
"" As the chi-square statistic is more extreme than the critical value, i.e. it lies in the rejection region, the assumption of independence of the WindDir9am and WindDir3pm variables can be rejected ""
# RainToday v Location
ctable7 = pd.crosstab(Weather_subset.RainToday, Weather_subset.Location)
ctable7
| Location | Adelaide | Albany | Albury | AliceSprings | BadgerysCreek | Ballarat | Bendigo | Brisbane | Cairns | Canberra | ... | Townsville | Tuggeranong | Uluru | WaggaWagga | Walpole | Watsonia | Williamtown | Witchcliffe | Wollongong | Woomera |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| RainToday | |||||||||||||||||||||
| No | 450 | 406 | 481 | 534 | 479 | 443 | 491 | 479 | 394 | 582 | ... | 500 | 496 | 271 | 477 | 383 | 431 | 360 | 396 | 439 | 563 |
| Yes | 131 | 150 | 138 | 52 | 107 | 154 | 120 | 137 | 177 | 118 | ... | 96 | 121 | 27 | 103 | 174 | 155 | 129 | 164 | 132 | 43 |
2 rows × 49 columns
# For chi-square statistic between RainToday v Location
statistic, p, dof, exfr = stats.chi2_contingency(ctable7, correction=True)
print('The chi square statistic is {:.2f}. The critical value for {} degrees of freedom at an alpha level of 0.05 is {:.2f}.'.
format(statistic, dof, stats.chi2.ppf(0.95, 1)))
The chi square statistic is 701.13. The critical value for 48 degrees of freedom at an alpha level of 0.05 is 3.84.
"" As the chi-square statistic is more extreme than the critical value, i.e. it lies in the rejection region, the assumption of independence of the WindDir9am and WindDir3pm variables can be rejected ""
# RainToday v WindGustDir
ctable8 = pd.crosstab(Weather_subset.RainToday, Weather_subset.WindGustDir)
ctable8
| WindGustDir | E | ENE | ESE | N | NE | NNE | NNW | NW | S | SE | SSE | SSW | SW | W | WNW | WSW |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| RainToday | ||||||||||||||||
| No | 1565 | 1373 | 1186 | 1545 | 1185 | 1077 | 1001 | 1220 | 1283 | 1475 | 1382 | 1197 | 1206 | 1343 | 1217 | 1245 |
| Yes | 267 | 219 | 247 | 293 | 185 | 182 | 254 | 360 | 490 | 356 | 402 | 491 | 503 | 600 | 456 | 509 |
# For chi-square statistic between RainToday v WindGustDir
statistic, p, dof, exfr = stats.chi2_contingency(ctable8, correction=True)
print('The chi square statistic is {:.2f}. The critical value for {} degrees of freedom at an alpha level of 0.05 is {:.2f}.'.
format(statistic, dof, stats.chi2.ppf(0.95, 1)))
The chi square statistic is 591.57. The critical value for 15 degrees of freedom at an alpha level of 0.05 is 3.84.
"" As the chi-square statistic is more extreme than the critical value, i.e. it lies in the rejection region, the assumption of independence of the WindDir9am and WindDir3pm variables can be rejected ""
# RainToday v WindDir9am
ctable9 = pd.crosstab(Weather_subset.RainToday, Weather_subset.WindDir9am)
ctable9
| WindDir9am | E | ENE | ESE | N | NE | NNE | NNW | NW | S | SE | SSE | SSW | SW | W | WNW | WSW |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| RainToday | ||||||||||||||||
| No | 1609 | 1353 | 1297 | 1893 | 1273 | 1339 | 1197 | 1251 | 1205 | 1515 | 1377 | 1024 | 1094 | 1112 | 959 | 857 |
| Yes | 206 | 189 | 206 | 397 | 211 | 210 | 395 | 420 | 472 | 324 | 368 | 489 | 542 | 562 | 433 | 475 |
# For chi-square statistic between RainToday v WindDir9am
statistic, p, dof, exfr = stats.chi2_contingency(ctable9, correction=True)
print('The chi square statistic is {:.2f}. The critical value for {} degrees of freedom at an alpha level of 0.05 is {:.2f}.'.
format(statistic, dof, stats.chi2.ppf(0.95, 1)))
The chi square statistic is 1022.37. The critical value for 15 degrees of freedom at an alpha level of 0.05 is 3.84.
"" As the chi-square statistic is more extreme than the critical value, i.e. it lies in the rejection region, the assumption of independence of the WindDir9am and WindDir3pm variables can be rejected ""
# RainToday v WindDir3pm
ctable10 = pd.crosstab(Weather_subset.RainToday, Weather_subset.WindDir3pm)
ctable10
| WindDir3pm | E | ENE | ESE | N | NE | NNE | NNW | NW | S | SE | SSE | SSW | SW | W | WNW | WSW |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| RainToday | ||||||||||||||||
| No | 1387 | 1278 | 1342 | 1422 | 1367 | 1141 | 1279 | 1345 | 1378 | 1658 | 1339 | 1112 | 1275 | 1380 | 1247 | 1395 |
| Yes | 280 | 222 | 287 | 259 | 226 | 161 | 272 | 387 | 513 | 442 | 445 | 493 | 486 | 567 | 497 | 525 |
# For chi-square statistic between RainToday v WindDir3pm
statistic, p, dof, exfr = stats.chi2_contingency(ctable10, correction=True)
print('The chi square statistic is {:.2f}. The critical value for {} degrees of freedom at an alpha level of 0.05 is {:.2f}.'.
format(statistic, dof, stats.chi2.ppf(0.95, 1)))
The chi square statistic is 550.02. The critical value for 15 degrees of freedom at an alpha level of 0.05 is 3.84.
"" As the chi-square statistic is more extreme than the critical value, i.e. it lies in the rejection region, the assumption of independence of the WindDir9am and WindDir3pm variables can be rejected ""
# To show number of values for if it will rain (= 0) or if it will not rain (= 1) RainTomorrow
Weather_subset['RainTomorrow'].value_counts()
0 22041 1 6397 Name: RainTomorrow, dtype: int64
## To show the percentage of 'No' V's 'Yes'
Weather_subset['RainTomorrow'].value_counts()/len(Weather)
0 0.155008 1 0.044988 Name: RainTomorrow, dtype: float64
# To visualise the above
f, ax = plt.subplots(figsize=(6, 8))
ax = sns.countplot(x="RainTomorrow", data=Weather_subset, palette="Set2")
plt.show()
# To show trend of rain/no rain over the last years
Rain_Yes = Weather_subset[Weather_subset.RainToday == "Yes"]
Rain_No = Weather_subset[Weather_subset.RainToday == "No"]
sns.lineplot(Rain_Yes.Year, Weather_subset.Pressure3pm, ci = None)
sns.lineplot(Rain_No.Year, Weather_subset.Pressure3pm, ci = None)
plt.legend(["Day with Rain", 'Day without Rain'])
plt.show()
#Here we can see that the dataset posses a small quantity of entrace on 2008, with a big amount on 2009 and 2010, with a small decrease of entrace until 2015, where it drops on 2016 and radically reduce the quantity of intomation given on it on 2015
sns.distplot(Weather_subset['Year'])
plt.show()
# To visualise relationship between each Location and if it will rain tomorrow
ctable7 = pd.crosstab(Weather_subset['Location'], Weather_subset['RainTomorrow']).apply(lambda r: r/r.sum(), axis=1)
_=ctable7.plot.bar(stacked=True, figsize=(15,8))
# To show numeric distribution between each Location and whether or not it rained tomorrow
ctable8 = pd.crosstab(Weather_subset['Location'], Weather_subset['RainTomorrow'])
ctable8
| RainTomorrow | 0 | 1 |
|---|---|---|
| Location | ||
| Adelaide | 466 | 129 |
| Albany | 386 | 179 |
| Albury | 493 | 129 |
| AliceSprings | 540 | 48 |
| BadgerysCreek | 480 | 118 |
| Ballarat | 452 | 149 |
| Bendigo | 497 | 115 |
| Brisbane | 476 | 145 |
| Cairns | 399 | 177 |
| Canberra | 574 | 129 |
| Cobar | 521 | 77 |
| CoffsHarbour | 402 | 203 |
| Dartmoor | 417 | 163 |
| Darwin | 467 | 181 |
| GoldCoast | 462 | 172 |
| Hobart | 474 | 149 |
| Katherine | 234 | 43 |
| Launceston | 438 | 145 |
| Melbourne | 355 | 106 |
| MelbourneAirport | 482 | 129 |
| Mildura | 554 | 56 |
| Moree | 491 | 85 |
| MountGambier | 437 | 214 |
| MountGinini | 438 | 181 |
| Newcastle | 464 | 142 |
| Nhil | 258 | 57 |
| NorahHead | 414 | 154 |
| NorfolkIsland | 438 | 194 |
| Nuriootpa | 491 | 118 |
| PearceRAAF | 460 | 97 |
| Penrith | 498 | 126 |
| Perth | 489 | 124 |
| PerthAirport | 475 | 112 |
| Portland | 379 | 221 |
| Richmond | 479 | 107 |
| Sale | 470 | 132 |
| SalmonGums | 460 | 105 |
| Sydney | 506 | 175 |
| SydneyAirport | 435 | 160 |
| Townsville | 502 | 95 |
| Tuggeranong | 506 | 121 |
| Uluru | 275 | 28 |
| WaggaWagga | 481 | 101 |
| Walpole | 379 | 185 |
| Watsonia | 448 | 143 |
| Williamtown | 394 | 132 |
| Witchcliffe | 391 | 172 |
| Wollongong | 444 | 136 |
| Woomera | 570 | 38 |
# To show percentage distribution between each Location and whether or not it rained tomorrow
pd.crosstab(Weather_subset['Location'], Weather_subset['RainTomorrow']).apply(lambda r: r/r.sum(), axis=1)
| RainTomorrow | 0 | 1 |
|---|---|---|
| Location | ||
| Adelaide | 0.783193 | 0.216807 |
| Albany | 0.683186 | 0.316814 |
| Albury | 0.792605 | 0.207395 |
| AliceSprings | 0.918367 | 0.081633 |
| BadgerysCreek | 0.802676 | 0.197324 |
| Ballarat | 0.752080 | 0.247920 |
| Bendigo | 0.812092 | 0.187908 |
| Brisbane | 0.766506 | 0.233494 |
| Cairns | 0.692708 | 0.307292 |
| Canberra | 0.816501 | 0.183499 |
| Cobar | 0.871237 | 0.128763 |
| CoffsHarbour | 0.664463 | 0.335537 |
| Dartmoor | 0.718966 | 0.281034 |
| Darwin | 0.720679 | 0.279321 |
| GoldCoast | 0.728707 | 0.271293 |
| Hobart | 0.760835 | 0.239165 |
| Katherine | 0.844765 | 0.155235 |
| Launceston | 0.751286 | 0.248714 |
| Melbourne | 0.770065 | 0.229935 |
| MelbourneAirport | 0.788871 | 0.211129 |
| Mildura | 0.908197 | 0.091803 |
| Moree | 0.852431 | 0.147569 |
| MountGambier | 0.671275 | 0.328725 |
| MountGinini | 0.707593 | 0.292407 |
| Newcastle | 0.765677 | 0.234323 |
| Nhil | 0.819048 | 0.180952 |
| NorahHead | 0.728873 | 0.271127 |
| NorfolkIsland | 0.693038 | 0.306962 |
| Nuriootpa | 0.806240 | 0.193760 |
| PearceRAAF | 0.825853 | 0.174147 |
| Penrith | 0.798077 | 0.201923 |
| Perth | 0.797716 | 0.202284 |
| PerthAirport | 0.809199 | 0.190801 |
| Portland | 0.631667 | 0.368333 |
| Richmond | 0.817406 | 0.182594 |
| Sale | 0.780731 | 0.219269 |
| SalmonGums | 0.814159 | 0.185841 |
| Sydney | 0.743025 | 0.256975 |
| SydneyAirport | 0.731092 | 0.268908 |
| Townsville | 0.840871 | 0.159129 |
| Tuggeranong | 0.807018 | 0.192982 |
| Uluru | 0.907591 | 0.092409 |
| WaggaWagga | 0.826460 | 0.173540 |
| Walpole | 0.671986 | 0.328014 |
| Watsonia | 0.758037 | 0.241963 |
| Williamtown | 0.749049 | 0.250951 |
| Witchcliffe | 0.694494 | 0.305506 |
| Wollongong | 0.765517 | 0.234483 |
| Woomera | 0.937500 | 0.062500 |
# To visualise relationship between each WindGustDir and if it will rain tomorrow
ctable9 = pd.crosstab(Weather_subset['WindGustDir'], Weather_subset['RainTomorrow']).apply(lambda r: r/r.sum(), axis=1)
_=ctable9.plot.bar(stacked=True, figsize=(15,8))
# To show numeric distribution between each WindGustDir and whether or not it rained tomorrow
ctable10 = pd.crosstab(Weather_subset['WindGustDir'], Weather_subset['RainTomorrow'])
ctable10
| RainTomorrow | 0 | 1 |
|---|---|---|
| WindGustDir | ||
| E | 1568 | 282 |
| ENE | 1356 | 251 |
| ESE | 1213 | 229 |
| N | 1348 | 507 |
| NE | 1136 | 254 |
| NNE | 985 | 282 |
| NNW | 905 | 361 |
| NW | 1166 | 430 |
| S | 1385 | 402 |
| SE | 1506 | 343 |
| SSE | 1430 | 371 |
| SSW | 1325 | 378 |
| SW | 1352 | 378 |
| W | 1432 | 525 |
| WNW | 1190 | 501 |
| WSW | 1387 | 386 |
# To show percentage distribution between each WindGustDir and whether or not it rained tomorrow
pd.crosstab(Weather_subset['WindGustDir'], Weather_subset['RainTomorrow']).apply(lambda r: r/r.sum(), axis=1)
| RainTomorrow | 0 | 1 |
|---|---|---|
| WindGustDir | ||
| E | 0.847568 | 0.152432 |
| ENE | 0.843808 | 0.156192 |
| ESE | 0.841193 | 0.158807 |
| N | 0.726685 | 0.273315 |
| NE | 0.817266 | 0.182734 |
| NNE | 0.777427 | 0.222573 |
| NNW | 0.714850 | 0.285150 |
| NW | 0.730576 | 0.269424 |
| S | 0.775042 | 0.224958 |
| SE | 0.814494 | 0.185506 |
| SSE | 0.794003 | 0.205997 |
| SSW | 0.778039 | 0.221961 |
| SW | 0.781503 | 0.218497 |
| W | 0.731732 | 0.268268 |
| WNW | 0.703726 | 0.296274 |
| WSW | 0.782290 | 0.217710 |
# To visualise relationshi between each WindDir9am and if it will rain tomorrow
ctable11 = pd.crosstab(Weather_subset['WindDir9am'], Weather_subset['RainTomorrow']).apply(lambda r: r/r.sum(), axis=1)
_=ctable11.plot.bar(stacked=True, figsize=(15,8))
# To show numeric distribution between each WindDir9am and whether or not it rained tomorrow
ctable12 = pd.crosstab(Weather_subset['WindDir9am'], Weather_subset['RainTomorrow'])
ctable12
| RainTomorrow | 0 | 1 |
|---|---|---|
| WindDir9am | ||
| E | 1549 | 273 |
| ENE | 1302 | 250 |
| ESE | 1303 | 207 |
| N | 1627 | 694 |
| NE | 1198 | 299 |
| NNE | 1138 | 427 |
| NNW | 1090 | 509 |
| NW | 1223 | 459 |
| S | 1350 | 342 |
| SE | 1549 | 300 |
| SSE | 1451 | 312 |
| SSW | 1189 | 332 |
| SW | 1292 | 359 |
| W | 1250 | 438 |
| WNW | 978 | 434 |
| WSW | 987 | 357 |
# To show percentage distribution between each WindDir9am and whether or not it rained tomorrow
pd.crosstab(Weather_subset['WindDir9am'], Weather_subset['RainTomorrow']).apply(lambda r: r/r.sum(), axis=1)
| RainTomorrow | 0 | 1 |
|---|---|---|
| WindDir9am | ||
| E | 0.850165 | 0.149835 |
| ENE | 0.838918 | 0.161082 |
| ESE | 0.862914 | 0.137086 |
| N | 0.700991 | 0.299009 |
| NE | 0.800267 | 0.199733 |
| NNE | 0.727157 | 0.272843 |
| NNW | 0.681676 | 0.318324 |
| NW | 0.727111 | 0.272889 |
| S | 0.797872 | 0.202128 |
| SE | 0.837750 | 0.162250 |
| SSE | 0.823029 | 0.176971 |
| SSW | 0.781723 | 0.218277 |
| SW | 0.782556 | 0.217444 |
| W | 0.740521 | 0.259479 |
| WNW | 0.692635 | 0.307365 |
| WSW | 0.734375 | 0.265625 |
# To visualise relationshi between each WindDir3pm and if it will rain tomorrow
ctable13 = pd.crosstab(Weather_subset['WindDir3pm'], Weather_subset['RainTomorrow']).apply(lambda r: r/r.sum(), axis=1)
_=ctable13.plot.bar(stacked=True, figsize=(15,8))
# To show numeric distribution between each WindDir3pm and whether or not it rained tomorrow
ctable14 = pd.crosstab(Weather_subset['WindDir3pm'], Weather_subset['RainTomorrow'])
ctable14
| RainTomorrow | 0 | 1 |
|---|---|---|
| WindDir3pm | ||
| E | 1390 | 295 |
| ENE | 1260 | 256 |
| ESE | 1366 | 275 |
| N | 1223 | 474 |
| NE | 1317 | 293 |
| NNE | 1008 | 308 |
| NNW | 1112 | 453 |
| NW | 1250 | 497 |
| S | 1506 | 405 |
| SE | 1714 | 408 |
| SSE | 1429 | 371 |
| SSW | 1228 | 393 |
| SW | 1420 | 350 |
| W | 1477 | 491 |
| WNW | 1281 | 484 |
| WSW | 1504 | 434 |
# To show percentage distribution between each WindDir3pm and whether or not it rained tomorrow
pd.crosstab(Weather_subset['WindDir3pm'], Weather_subset['RainTomorrow']).apply(lambda r: r/r.sum(), axis=1)
| RainTomorrow | 0 | 1 |
|---|---|---|
| WindDir3pm | ||
| E | 0.824926 | 0.175074 |
| ENE | 0.831135 | 0.168865 |
| ESE | 0.832419 | 0.167581 |
| N | 0.720684 | 0.279316 |
| NE | 0.818012 | 0.181988 |
| NNE | 0.765957 | 0.234043 |
| NNW | 0.710543 | 0.289457 |
| NW | 0.715512 | 0.284488 |
| S | 0.788069 | 0.211931 |
| SE | 0.807729 | 0.192271 |
| SSE | 0.793889 | 0.206111 |
| SSW | 0.757557 | 0.242443 |
| SW | 0.802260 | 0.197740 |
| W | 0.750508 | 0.249492 |
| WNW | 0.725779 | 0.274221 |
| WSW | 0.776058 | 0.223942 |
# check number of missing values by column in Dataset
Weather_subset.isnull().sum()
Location 0 MinTemp 142 MaxTemp 77 Rainfall 291 Evaporation 12215 Sunshine 13606 WindGustDir 1874 WindGustSpeed 1867 WindDir9am 1970 WindDir3pm 766 WindSpeed9am 277 WindSpeed3pm 550 Humidity9am 356 Humidity3pm 714 Pressure9am 2878 Pressure3pm 2869 Cloud9am 10772 Cloud3pm 11450 Temp9am 190 Temp3pm 551 RainToday 291 RainTomorrow 0 Year 0 Month 0 Day 0 dtype: int64
# columns with missing values
[(col,Weather_subset[col].isnull().sum()) for
col in Weather_subset.columns if Weather_subset[col].isnull().any()]
[('MinTemp', 142),
('MaxTemp', 77),
('Rainfall', 291),
('Evaporation', 12215),
('Sunshine', 13606),
('WindGustDir', 1874),
('WindGustSpeed', 1867),
('WindDir9am', 1970),
('WindDir3pm', 766),
('WindSpeed9am', 277),
('WindSpeed3pm', 550),
('Humidity9am', 356),
('Humidity3pm', 714),
('Pressure9am', 2878),
('Pressure3pm', 2869),
('Cloud9am', 10772),
('Cloud3pm', 11450),
('Temp9am', 190),
('Temp3pm', 551),
('RainToday', 291)]
Missing data 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm'
Not missing data 'Location', 'RainToday', 'RainTomorrow', 'Year', 'Month', 'Day'
# To fill missing categorical data with the mode and missing numerical data with the median
for i in categorical:
Weather_subset[i].fillna(value=Weather_subset[i].mode()[0],inplace=True)
for k in numerical:
Weather_subset[k].fillna(value=Weather_subset[k].median(),inplace=True)
# to visualise if all missing data has now been taken care of
sns.heatmap(Weather_subset.isnull(), cbar=False)
<AxesSubplot:>
# columns with missing values
[(col,Weather_subset[col].isnull().sum()) for
col in Weather_subset.columns if Weather_subset[col].isnull().any()]
[]
#To show categorical columns to be investigated
print("Categorical Columns: ",categorical)
Categorical Columns: ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
# To dummy encode cateogrical data and drop first column to result in k-1 dummies out of k categorical levels
Weather_encoded = pd.get_dummies(data=Weather_subset,drop_first=True)
Weather_encoded
| MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustSpeed | WindSpeed9am | WindSpeed3pm | Humidity9am | Humidity3pm | ... | WindDir3pm_NW | WindDir3pm_S | WindDir3pm_SE | WindDir3pm_SSE | WindDir3pm_SSW | WindDir3pm_SW | WindDir3pm_W | WindDir3pm_WNW | WindDir3pm_WSW | RainToday_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 33491 | 12.8 | 21.6 | 0.0 | 3.0 | 10.9 | 43.0 | 9.0 | 30.0 | 62.0 | 56.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 16277 | 9.0 | 21.7 | 0.2 | 4.8 | 8.5 | 39.0 | 9.0 | 4.0 | 70.0 | 40.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 57069 | 10.3 | 32.9 | 0.0 | 5.6 | 8.5 | 63.0 | 13.0 | 28.0 | 33.0 | 12.0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 96349 | 15.5 | 23.6 | 0.0 | 4.8 | 8.5 | 30.0 | 9.0 | 13.0 | 74.0 | 47.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 105217 | 11.7 | 23.5 | 0.0 | 3.6 | 3.1 | 31.0 | 0.0 | 20.0 | 52.0 | 33.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 60322 | 1.1 | 10.5 | 0.0 | 1.6 | 0.2 | 26.0 | 6.0 | 13.0 | 92.0 | 96.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 80960 | 2.7 | 11.8 | 0.0 | 1.2 | 9.2 | 35.0 | 0.0 | 15.0 | 70.0 | 52.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 41969 | 10.3 | 17.3 | 0.0 | 4.8 | 8.5 | 76.0 | 17.0 | 26.0 | 50.0 | 33.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 138987 | 25.2 | 31.9 | 0.2 | 6.4 | 6.2 | 43.0 | 19.0 | 28.0 | 81.0 | 63.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 17089 | 11.9 | 26.5 | 0.0 | 4.8 | 8.5 | 39.0 | 9.0 | 19.0 | 81.0 | 52.0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
28438 rows × 114 columns
We can now run the one way ANOVA test as the NANs have been taken care of
The below code is to be run twice due to initial error
model = ols ('MinTemp ~ Location', data = Weather_subset).fit()
anova_table = sm.stats.anova_lm(model,typ=1)
print('----------------------')
print('ANOVA table for Location')
print('----------------------')
print(anova_table)
print('----------------------')
model = ols ('MinTemp ~ WindGustDir', data = Weather_subset).fit()
anova_table = sm.stats.anova_lm(model,typ=1)
anova_table
print('ANOVA table for WindGustDir')
print('----------------------')
print(anova_table)
print('----------------------')
model = ols ('MinTemp ~ WindDir9am', data = Weather_subset).fit()
anova_table = sm.stats.anova_lm(model,typ=1)
anova_table
print('ANOVA table for WindDir9am')
print('----------------------')
print(anova_table)
print('----------------------')
model = ols ('MinTemp ~ WindDir3pm', data = Weather_subset).fit()
anova_table = sm.stats.anova_lm(model,typ=1)
print('----------------------')
print('ANOVA table for WindDir3pm')
print('----------------------')
print(anova_table)
print('----------------------')
model = ols ('MinTemp ~ RainToday', data = Weather_subset).fit()
anova_table = sm.stats.anova_lm(model,typ=1)
print('----------------------')
print('ANOVA table for RainToday')
print('----------------------')
print(anova_table)
----------------------
ANOVA table for Location
----------------------
df sum_sq mean_sq F PR(>F)
Location 48.0 428389.511375 8924.781487 341.046326 0.0
Residual 28389.0 742906.762664 26.168825 NaN NaN
----------------------
ANOVA table for WindGustDir
----------------------
df sum_sq mean_sq F PR(>F)
WindGustDir 15.0 6.190209e+04 4126.806047 105.726245 2.070629e-320
Residual 28422.0 1.109394e+06 39.032939 NaN NaN
----------------------
ANOVA table for WindDir9am
----------------------
df sum_sq mean_sq F PR(>F)
WindDir9am 15.0 5.372474e+04 3581.649342 91.088252 1.745808e-275
Residual 28422.0 1.117572e+06 39.320651 NaN NaN
----------------------
----------------------
ANOVA table for WindDir3pm
----------------------
df sum_sq mean_sq F PR(>F)
WindDir3pm 15.0 6.707955e+04 4471.970057 115.106329 0.0
Residual 28422.0 1.104217e+06 38.850775 NaN NaN
----------------------
----------------------
ANOVA table for RainToday
----------------------
df sum_sq mean_sq F PR(>F)
RainToday 1.0 3.645331e+03 3645.330655 88.775351 4.746922e-21
Residual 28436.0 1.167651e+06 41.062419 NaN NaN
#To show numerical columns to be investigated
print("Numerical Columns: ",numerical)
Numerical Columns: ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
# to visualise Numerical Columns using boxplots for potential outliers
plt.figure(figsize=(20,20))
cont_columns = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
j=0
for i in cont_columns:
j = j+1
plt.subplot(4, 4, j)
fig = Weather_encoded.boxplot(column=Weather_encoded.columns[i])
fig.set_title('')
fig.set_ylabel(Weather_encoded.columns[i])
# to visualise the distribution using histograms for potential outliers
plt.figure(figsize=(20,20))
cont_columns = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
j=0
for i in cont_columns:
j = j+1
plt.subplot(4, 4, j)
fig = Weather[Weather_encoded.columns[i]].hist(bins=10)
fig.set_xlabel(Weather_encoded.columns[i])
fig.set_ylabel('RainTomorrow')
From the above box plots and histograms we can see a quite a number of potential outliers especially for the columns Rainfall and Evaporation. We will confirm the outliers and cap using the interquartile range method
# find outliers for MinTemp variable
IQR = Weather_encoded.MinTemp.quantile(0.75) - Weather_encoded.MinTemp.quantile(0.25)
Lower_fence = Weather_encoded.MinTemp.quantile(0.25) - (IQR * 3)
Upper_fence = Weather_encoded.MinTemp.quantile(0.75) + (IQR * 3)
print('MinTemp outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))
MinTemp outliers are values < -20.0 or > 44.400000000000006
# for the minimum and maximum value for MinTemp
print("The minimum value is", MinTemp_total_min)
print("The maximum value is", MinTemp_total_max)
The minimum value is -8.2 The maximum value is 30.3
For MinTemp, the minimum and maximum values are -8.2 and 30.3 respectively. So, there are no outliers.
# find outliers for MaxTemp variable
IQR = Weather_encoded.MaxTemp.quantile(0.75) - Weather_encoded.MaxTemp.quantile(0.25)
Lower_fence = Weather_encoded.MaxTemp.quantile(0.25) - (IQR * 3)
Upper_fence = Weather_encoded.MaxTemp.quantile(0.75) + (IQR * 3)
print('MaxTemp outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))
MaxTemp outliers are values < -13.000000000000004 or > 59.1
# for the minimum and maximum value for MaxTemp
print("The minimum value is", MaxTemp_total_min)
print("The maximum value is", MaxTemp_total_max)
The minimum value is -3.1 The maximum value is 48.1
For MaxTemp, the minimum and maximum values are -3.1 and 48.1 respectively. So, there are no outliers.
# find outliers for Rainfall variable
IQR = Weather_encoded.Rainfall.quantile(0.75) - Weather_encoded.Rainfall.quantile(0.25)
Lower_fence = Weather_encoded.Rainfall.quantile(0.25) - (IQR * 3)
Upper_fence = Weather_encoded.Rainfall.quantile(0.75) + (IQR * 3)
print('Rainfall outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))
Rainfall outliers are values < -1.7999999999999998 or > 2.4
# for the minimum and maximum value for Rainfall
print("The minimum value is", Rainfall_total_min)
print("The maximum value is", Rainfall_total_max)
The minimum value is 0.0 The maximum value is 371.0
For Rainfall, the minimum and maximum values are 0 and 371 respectively. So, the outliers are the values greater than 2.4. There is no lower limit.
# find outliers for Evaporation variable
IQR = Weather_encoded.Evaporation.quantile(0.75) - Weather_encoded.Evaporation.quantile(0.25)
Lower_fence = Weather_encoded.Evaporation.quantile(0.25) - (IQR * 3)
Upper_fence = Weather_encoded.Evaporation.quantile(0.75) + (IQR * 3)
print('Evaporation outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))
Evaporation outliers are values < -0.20000000000000107 or > 9.600000000000001
# for the minimum and maximum value for Evaporation
print("The minimum value is", Evaporation_total_min)
print("The maximum value is", Evaporation_total_max)
The minimum value is 0.0 The maximum value is 86.2
For Evaporation, the minimum and maximum values are 0 and 86.2 respectively. So, the outliers are the values greater than 9.600000000000001. There is no lower limit.
# find outliers for WindGustSpeed variable
IQR = Weather_encoded.WindGustSpeed.quantile(0.75) - Weather_encoded.WindGustSpeed.quantile(0.25)
Lower_fence = Weather_encoded.WindGustSpeed.quantile(0.25) - (IQR * 3)
Upper_fence = Weather_encoded.WindGustSpeed.quantile(0.75) + (IQR * 3)
print('WindGustSpeed outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))
WindGustSpeed outliers are values < -14.0 or > 91.0
# for the minimum and maximum value for WindGustSpeed
print("The minimum value is", WindGustSpeed_total_min)
print("The maximum value is", WindGustSpeed_total_max)
The minimum value is 6.0 The maximum value is 126.0
For WindGustSpeed, the minimum and maximum values are 6 and 126 respectively. So, the outliers are the values greater than 91. There is no lower limit.
# find outliers for WindSpeed9am variable
IQR = Weather_encoded.MinTemp.quantile(0.75) - Weather_encoded.WindSpeed9am.quantile(0.25)
Lower_fence = Weather_encoded.WindSpeed9am.quantile(0.25) - (IQR * 3)
Upper_fence = Weather_encoded.WindSpeed9am.quantile(0.75) + (IQR * 3)
print('WindSpeed9am outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))
WindSpeed9am outliers are values < -22.400000000000002 or > 48.400000000000006
# for the minimum and maximum value for WindSpeed9am
print("The minimum value is", WindSpeed9am_total_min)
print("The maximum value is", WindSpeed9am_total_max)
The minimum value is 0.0 The maximum value is 130.0
For WindSpeed9am, the minimum and maximum values are 0 and 130 respectively. So, the outliers are the values greater than 48.400000000000006. There is no lower limit.
# find outliers for WindSpeed3pm variable
IQR = Weather_encoded.WindSpeed3pm.quantile(0.75) - Weather_encoded.WindSpeed3pm.quantile(0.25)
Lower_fence = Weather_encoded.WindSpeed3pm.quantile(0.25) - (IQR * 3)
Upper_fence = Weather_encoded.WindSpeed3pm.quantile(0.75) + (IQR * 3)
print('WindSpeed3pm outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))
WindSpeed3pm outliers are values < -20.0 or > 57.0
# for the minimum and maximum value for WindSpeed3pm
print("The minimum value is", WindSpeed3pm_total_min)
print("The maximum value is", WindSpeed3pm_total_max)
The minimum value is 0.0 The maximum value is 78.0
For WindSpeed3pm, the minimum and maximum values are 0 and 78 respectively. So, the outliers are the values greater than 57. There is no lower limit.
# find outliers for Humidity9am variable
IQR = Weather_encoded.Humidity9am.quantile(0.75) - Weather_encoded.Humidity9am.quantile(0.25)
Lower_fence = Weather_encoded.Humidity9am.quantile(0.25) - (IQR * 3)
Upper_fence = Weather_encoded.Humidity9am.quantile(0.75) + (IQR * 3)
print('Humidity9am outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))
Humidity9am outliers are values < -21.0 or > 161.0
# for the minimum and maximum value for Humidity9am
print("The minimum value is", Humidity9am_total_min)
print("The maximum value is", Humidity9am_total_max)
The minimum value is 3.0 The maximum value is 100.0
For Humidity9am, the minimum and maximum values are 3 and 100 respectively. So, there are no outliers
# find outliers for Pressure9am variable
IQR = Weather_encoded.Pressure9am.quantile(0.75) - Weather_encoded.Pressure9am.quantile(0.25)
Lower_fence = Weather_encoded.Pressure9am.quantile(0.25) - (IQR * 3)
Upper_fence = Weather_encoded.Pressure9am.quantile(0.75) + (IQR * 3)
print('Pressure9am outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))
Pressure9am outliers are values < 988.6000000000001 or > 1046.6999999999998
# for the minimum and maximum value for Pressure9am
print("The minimum value is", Pressure9am_total_min)
print("The maximum value is", Pressure9am_total_max)
The minimum value is 982.0 The maximum value is 1040.4
For Pressure9am, the minimum and maximum values are 982 and 1040.4 respectively. So, the outliers are the values less than 984.4. There is no upper limit.
# find outliers for Pressure3pm variable
IQR = Weather_encoded.Pressure3pm.quantile(0.75) - Weather_encoded.Pressure3pm.quantile(0.25)
Lower_fence = Weather_encoded.Pressure3pm.quantile(0.25) - (IQR * 3)
Upper_fence = Weather_encoded.Pressure3pm.quantile(0.75) + (IQR * 3)
print('Pressure3pm outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))
Pressure3pm outliers are values < 985.8000000000001 or > 1044.6
# for the minimum and maximum value for Pressure3pm
print("The minimum value is", Pressure3pm_total_min)
print("The maximum value is", Pressure3pm_total_max)
The minimum value is 977.1 The maximum value is 1038.9
For Pressure3pm, the minimum and maximum values are 977.1 and 1038.9 respectively. So, the outliers are the values less than 981.5999999999999. There is no upper limit.
# find outliers for Temp9am variable
IQR = Weather_encoded.MinTemp.quantile(0.75) - Weather_encoded.Temp9am.quantile(0.25)
Lower_fence = Weather_encoded.Temp9am.quantile(0.25) - (IQR * 3)
Upper_fence = Weather_encoded.Temp9am.quantile(0.75) + (IQR * 3)
print('Temp9am outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))
Temp9am outliers are values < -1.1999999999999993 or > 35.0
# for the minimum and maximum value for Temp9am
print("The minimum value is", Temp9am_total_min)
print("The maximum value is", Temp9am_total_max)
The minimum value is -7.0 The maximum value is 39.0
For Temp9am, the minimum and maximum values are -7 and 39 respectively. So, the outliers are values greater than 35 and less than -1.1999999999999993.
# find outliers for Temp3pm variable
IQR = Weather_encoded.Temp3pm.quantile(0.75) - Weather_encoded.Temp3pm.quantile(0.25)
Lower_fence = Weather_encoded.Temp3pm.quantile(0.25) - (IQR * 3)
Upper_fence = Weather_encoded.Temp3pm.quantile(0.75) + (IQR * 3)
print('Temp3pm outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))
Temp3pm outliers are values < -12.100000000000005 or > 55.10000000000001
# for the minimum and maximum value for Temp3pm
print("The minimum value is", Temp3pm_total_min)
print("The maximum value is", Temp3pm_total_max)
The minimum value is -4.2 The maximum value is 46.2
For Temp3pm, the minimum and maximum values are -4.2 and 46.2 respectively. So, in this case there are no outliers.
# To put a maximum per the above figures which identified the outliers for each column
def max_value(df3, variable, top):
return np.where(df3[variable]>top, top, df3[variable])
for df3 in [Weather_encoded]:
df3['Rainfall'] = max_value(df3, 'Rainfall', 2.4)
df3['Evaporation'] = max_value(df3, 'Evaporation', 9.600000000000001)
df3['WindGustSpeed'] = max_value(df3, 'WindGustSpeed', 91)
df3['WindSpeed9am'] = max_value(df3, 'WindSpeed9am', 48.400000000000006)
df3['WindSpeed3pm'] = max_value(df3, 'WindSpeed3pm', 57)
df3['Temp9am'] = max_value(df3, 'Temp9am', 35)
# To put a minimum per the above figures which identified the outliers for each column
def min_value(df3, variable, bottom):
return np.where(df3[variable]<bottom, bottom, df3[variable])
for df3 in [Weather_encoded]:
df3['Pressure9am'] = min_value(df3, 'Pressure9am', 984.4)
df3['Pressure3pm'] = min_value(df3, 'Pressure3pm', 981.5999999999999)
df3['Temp9am'] = min_value(df3, 'Temp9am', -1.1999999999999993)
# To show new max and min forTemp9am
print("The minimum value is", Weather_encoded['Temp9am'].min())
print("The maximum value is", Weather_encoded['Temp9am'].max())
The minimum value is -1.1999999999999993 The maximum value is 35.0
# To show new max and min for Rainfall
print("The minimum value is", Weather_encoded['Rainfall'].min())
print("The maximum value is", Weather_encoded['Rainfall'].max())
The minimum value is 0.0 The maximum value is 2.4
# to visualise new Numerical Columns using boxplots
plt.figure(figsize=(20,20))
cont_columns = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
j=0
for i in cont_columns:
j = j+1
plt.subplot(4, 4, j)
fig = Weather_encoded.boxplot(column=Weather_encoded.columns[i])
fig.set_title('')
fig.set_ylabel(Weather_encoded.columns[i])
# to visualise the new distribution using histograms
plt.figure(figsize=(20,20))
cont_columns = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
j=0
for i in cont_columns:
j = j+1
plt.subplot(4, 4, j)
fig = Weather_encoded[Weather_encoded.columns[i]].hist(bins=10)
fig.set_xlabel(Weather_encoded.columns[i])
fig.set_ylabel('RainTomorrow')
# To define X and y (dependant variable)
X = Weather_encoded.drop(['RainTomorrow'], axis=1)
y = Weather_encoded['RainTomorrow']
# To locate the 10 best independent variables for predicting the dependant variable
MI_selector = SelectKBest(f_classif, k=10)
kbest = MI_selector.fit_transform(X, y)
X_count = X.shape[1]
kbest_count = kbest.shape[1]
format_string = 'Original variable count: {}\tReduced variable count: {}'
print(format_string.format(X_count,kbest_count))
cols_retained10 = MI_selector.get_support(indices=True)
print('Columns retained:',cols_retained10)
print('Columns retained:',X.columns[cols_retained10])
Original variable count: 113 Reduced variable count: 10
Columns retained: [ 2 4 5 8 9 10 11 12 13 112]
Columns retained: Index(['Rainfall', 'Sunshine', 'WindGustSpeed', 'Humidity9am', 'Humidity3pm',
'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'RainToday_Yes'],
dtype='object')
# To locate the 5 best independent variables for predicting the dependant variable
MI_selector = SelectKBest(f_classif, k=5)
kbest = MI_selector.fit_transform(X, y)
X_count = X.shape[1]
kbest_count = kbest.shape[1]
format_string = 'Original variable count: {}\tReduced variable count: {}'
print(format_string.format(X_count,kbest_count))
cols_retained5 = MI_selector.get_support(indices=True)
print('Columns retained:',cols_retained5)
print('Columns retained:',X.columns[cols_retained5])
Original variable count: 113 Reduced variable count: 5 Columns retained: [ 2 4 9 13 112] Columns retained: Index(['Rainfall', 'Sunshine', 'Humidity3pm', 'Cloud3pm', 'RainToday_Yes'], dtype='object')
# To split and define X_train, X_test as well as y_train and y_test
train, test = train_test_split(Weather_encoded,test_size = 0.2,random_state=4242)
X_train = train.drop(['RainTomorrow'],axis=1)
y_train = train.filter(['RainTomorrow'])
X_test = test.drop(['RainTomorrow'],axis=1)
y_test = test.filter(['RainTomorrow'])
Please review the Code part 2 for ridge, lasso, PCA and Logistic Regression